1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to determine the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
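//
// For example (conceptually, in C-like pseudo code), with a vector width of 4
// a loop such as
//   for (i = 0; i < n; i++) a[i] = b[i] + 1;
// becomes
//   for (i = 0; i + 3 < n; i += 4) a[i:i+3] = b[i:i+3] + 1; // wide iteration
//   for (; i < n; i++) a[i] = b[i] + 1;                     // scalar remainder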
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SmallPtrSet.h"
72 #include "llvm/ADT/SmallSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
77 #include "llvm/ADT/iterator_range.h"
78 #include "llvm/Analysis/AssumptionCache.h"
79 #include "llvm/Analysis/BasicAliasAnalysis.h"
80 #include "llvm/Analysis/BlockFrequencyInfo.h"
81 #include "llvm/Analysis/CFG.h"
82 #include "llvm/Analysis/CodeMetrics.h"
83 #include "llvm/Analysis/DemandedBits.h"
84 #include "llvm/Analysis/GlobalsModRef.h"
85 #include "llvm/Analysis/LoopAccessAnalysis.h"
86 #include "llvm/Analysis/LoopAnalysisManager.h"
87 #include "llvm/Analysis/LoopInfo.h"
88 #include "llvm/Analysis/LoopIterator.h"
89 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
90 #include "llvm/Analysis/ProfileSummaryInfo.h"
91 #include "llvm/Analysis/ScalarEvolution.h"
92 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
93 #include "llvm/Analysis/TargetLibraryInfo.h"
94 #include "llvm/Analysis/TargetTransformInfo.h"
95 #include "llvm/Analysis/VectorUtils.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
102 #include "llvm/IR/DebugInfoMetadata.h"
103 #include "llvm/IR/DebugLoc.h"
104 #include "llvm/IR/DerivedTypes.h"
105 #include "llvm/IR/DiagnosticInfo.h"
106 #include "llvm/IR/Dominators.h"
107 #include "llvm/IR/Function.h"
108 #include "llvm/IR/IRBuilder.h"
109 #include "llvm/IR/InstrTypes.h"
110 #include "llvm/IR/Instruction.h"
111 #include "llvm/IR/Instructions.h"
112 #include "llvm/IR/IntrinsicInst.h"
113 #include "llvm/IR/Intrinsics.h"
114 #include "llvm/IR/Metadata.h"
115 #include "llvm/IR/Module.h"
116 #include "llvm/IR/Operator.h"
117 #include "llvm/IR/PatternMatch.h"
118 #include "llvm/IR/Type.h"
119 #include "llvm/IR/Use.h"
120 #include "llvm/IR/User.h"
121 #include "llvm/IR/Value.h"
122 #include "llvm/IR/ValueHandle.h"
123 #include "llvm/IR/Verifier.h"
124 #include "llvm/InitializePasses.h"
125 #include "llvm/Pass.h"
126 #include "llvm/Support/Casting.h"
127 #include "llvm/Support/CommandLine.h"
128 #include "llvm/Support/Compiler.h"
129 #include "llvm/Support/Debug.h"
130 #include "llvm/Support/ErrorHandling.h"
131 #include "llvm/Support/InstructionCost.h"
132 #include "llvm/Support/MathExtras.h"
133 #include "llvm/Support/raw_ostream.h"
134 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
135 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <map>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 #ifndef NDEBUG
160 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161 #endif
162 
163 /// @{
164 /// Metadata attribute names
165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166 const char LLVMLoopVectorizeFollowupVectorized[] =
167     "llvm.loop.vectorize.followup_vectorized";
168 const char LLVMLoopVectorizeFollowupEpilogue[] =
169     "llvm.loop.vectorize.followup_epilogue";
170 /// @}
171 
172 STATISTIC(LoopsVectorized, "Number of loops vectorized");
173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175 
176 static cl::opt<bool> EnableEpilogueVectorization(
177     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178     cl::desc("Enable vectorization of epilogue loops."));
179 
180 static cl::opt<unsigned> EpilogueVectorizationForceVF(
181     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182     cl::desc("When epilogue vectorization is enabled, and a value greater than "
183              "1 is specified, forces the given VF for all applicable epilogue "
184              "loops."));
185 
186 static cl::opt<unsigned> EpilogueVectorizationMinVF(
187     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188     cl::desc("Only loops with vectorization factor equal to or larger than "
189              "the specified value are considered for epilogue vectorization."));
190 
191 /// Loops with a known constant trip count below this number are vectorized only
192 /// if no scalar iteration overheads are incurred.
193 static cl::opt<unsigned> TinyTripCountVectorThreshold(
194     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195     cl::desc("Loops with a constant trip count that is smaller than this "
196              "value are vectorized only if no scalar iteration overheads "
197              "are incurred."));
198 
199 static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
200     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201     cl::desc("The maximum allowed number of runtime memory checks"));
202 
203 // The option prefer-predicate-over-epilogue indicates that an epilogue is
204 // undesired and that predication is preferred; the enumeration below lists
205 // the possible choices, and an example follows it. I.e., the vectorizer will
206 // try to fold the tail loop (epilogue) into the vector body and predicate the
207 // instructions accordingly. If tail-folding fails, the fallback depends on:
208 namespace PreferPredicateTy {
209   enum Option {
210     ScalarEpilogue = 0,
211     PredicateElseScalarEpilogue,
212     PredicateOrDontVectorize
213   };
214 } // namespace PreferPredicateTy
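// For example (illustrative numbers), when tail-folding a loop with trip count
// 10 at VF 4, the vector body executes 3 masked iterations (the last one with
// only 2 active lanes) instead of 2 unmasked vector iterations plus a
// 2-iteration scalar epilogue.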
215 
216 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
217     "prefer-predicate-over-epilogue",
218     cl::init(PreferPredicateTy::ScalarEpilogue),
219     cl::Hidden,
220     cl::desc("Tail-folding and predication preferences over creating a scalar "
221              "epilogue loop."),
222     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
223                          "scalar-epilogue",
224                          "Don't tail-predicate loops, create scalar epilogue"),
225               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
226                          "predicate-else-scalar-epilogue",
227                          "prefer tail-folding, create scalar epilogue if tail "
228                          "folding fails."),
229               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
230                          "predicate-dont-vectorize",
231                          "prefers tail-folding, don't attempt vectorization if "
232                          "tail-folding fails.")));
233 
234 static cl::opt<bool> MaximizeBandwidth(
235     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
236     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
237              "which will be determined by the smallest type in the loop."));
238 
239 static cl::opt<bool> EnableInterleavedMemAccesses(
240     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
241     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
242 
243 /// An interleave-group may need masking if it resides in a block that needs
244 /// predication, or in order to mask away gaps.
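/// For example (illustrative), an interleave group that loads only a[2*i] from
/// a pair {a[2*i], a[2*i+1]} has a gap; vectorizing it as a single wide load
/// requires masking off the unused members unless accessing them is known to
/// be safe.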
245 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
246     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
247     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
248 
249 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
250     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
251     cl::desc("We don't interleave loops with an estimated constant trip count "
252              "below this number"));
253 
254 static cl::opt<unsigned> ForceTargetNumScalarRegs(
255     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
256     cl::desc("A flag that overrides the target's number of scalar registers."));
257 
258 static cl::opt<unsigned> ForceTargetNumVectorRegs(
259     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
260     cl::desc("A flag that overrides the target's number of vector registers."));
261 
262 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
263     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
264     cl::desc("A flag that overrides the target's max interleave factor for "
265              "scalar loops."));
266 
267 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
268     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
269     cl::desc("A flag that overrides the target's max interleave factor for "
270              "vectorized loops."));
271 
272 static cl::opt<unsigned> ForceTargetInstructionCost(
273     "force-target-instruction-cost", cl::init(0), cl::Hidden,
274     cl::desc("A flag that overrides the target's expected cost for "
275              "an instruction to a single constant value. Mostly "
276              "useful for getting consistent testing."));
277 
278 static cl::opt<bool> ForceTargetSupportsScalableVectors(
279     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
280     cl::desc(
281         "Pretend that scalable vectors are supported, even if the target does "
282         "not support them. This flag should only be used for testing."));
283 
284 static cl::opt<unsigned> SmallLoopCost(
285     "small-loop-cost", cl::init(20), cl::Hidden,
286     cl::desc(
287         "The cost of a loop that is considered 'small' by the interleaver."));
288 
289 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
290     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
291     cl::desc("Enable the use of the block frequency analysis to access PGO "
292              "heuristics minimizing code growth in cold regions and being more "
293              "aggressive in hot regions."));
294 
295 // Runtime interleave loops for load/store throughput.
296 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
297     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
298     cl::desc(
299         "Enable runtime interleaving until load/store ports are saturated"));
300 
301 /// Interleave small loops with scalar reductions.
302 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
303     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
304     cl::desc("Enable interleaving for loops with small iteration counts that "
305              "contain scalar reductions to expose ILP."));
306 
307 /// The number of stores in a loop that are allowed to need predication.
308 static cl::opt<unsigned> NumberOfStoresToPredicate(
309     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
310     cl::desc("Max number of stores to be predicated behind an if."));
311 
312 static cl::opt<bool> EnableIndVarRegisterHeur(
313     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
314     cl::desc("Count the induction variable only once when interleaving"));
315 
316 static cl::opt<bool> EnableCondStoresVectorization(
317     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
318     cl::desc("Enable if predication of stores during vectorization."));
319 
320 static cl::opt<unsigned> MaxNestedScalarReductionIC(
321     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
322     cl::desc("The maximum interleave count to use when interleaving a scalar "
323              "reduction in a nested loop."));
324 
325 static cl::opt<bool>
326     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
327                            cl::Hidden,
328                            cl::desc("Prefer in-loop vector reductions, "
329                                     "overriding the target's preference."));
330 
331 static cl::opt<bool> ForceOrderedReductions(
332     "force-ordered-reductions", cl::init(false), cl::Hidden,
333     cl::desc("Enable the vectorization of loops with in-order (strict) "
334              "FP reductions"));
335 
336 static cl::opt<bool> PreferPredicatedReductionSelect(
337     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
338     cl::desc(
339         "Prefer predicating a reduction operation over an after loop select."));
340 
341 cl::opt<bool> EnableVPlanNativePath(
342     "enable-vplan-native-path", cl::init(false), cl::Hidden,
343     cl::desc("Enable VPlan-native vectorization path with "
344              "support for outer loop vectorization."));
345 
346 // This flag enables the stress testing of the VPlan H-CFG construction in the
347 // VPlan-native vectorization path. It must be used in conjunction with
348 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
349 // verification of the H-CFGs built.
350 static cl::opt<bool> VPlanBuildStressTest(
351     "vplan-build-stress-test", cl::init(false), cl::Hidden,
352     cl::desc(
353         "Build VPlan for every supported loop nest in the function and bail "
354         "out right after the build (stress test the VPlan H-CFG construction "
355         "in the VPlan-native vectorization path)."));
356 
357 cl::opt<bool> llvm::EnableLoopInterleaving(
358     "interleave-loops", cl::init(true), cl::Hidden,
359     cl::desc("Enable loop interleaving in Loop vectorization passes"));
360 cl::opt<bool> llvm::EnableLoopVectorization(
361     "vectorize-loops", cl::init(true), cl::Hidden,
362     cl::desc("Run the Loop vectorization passes"));
363 
364 cl::opt<bool> PrintVPlansInDotFormat(
365     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
366     cl::desc("Use dot format instead of plain text when dumping VPlans"));
367 
368 /// A helper function that returns true if the given type is irregular. The
369 /// type is irregular if its allocated size doesn't equal the store size of an
370 /// element of the corresponding vector type.
371 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
372   // Determine if an array of N elements of type Ty is "bitcast compatible"
373   // with a <N x Ty> vector.
374   // This is only true if there is no padding between the array elements.
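  // For example (target/DataLayout dependent), x86_fp80 typically has a type
  // size of 80 bits but an alloc size of 96 or 128 bits, so padding would be
  // introduced between array elements and the type is considered irregular.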
375   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
376 }
377 
378 /// A helper function that returns the reciprocal of the block probability of
379 /// predicated blocks. If we return X, we are assuming the predicated block
380 /// will execute once for every X iterations of the loop header.
381 ///
382 /// TODO: We should use actual block probability here, if available. Currently,
383 ///       we always assume predicated blocks have a 50% chance of executing.
384 static unsigned getReciprocalPredBlockProb() { return 2; }
385 
386 /// A helper function that returns an integer or floating-point constant with
387 /// value C.
388 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
389   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
390                            : ConstantFP::get(Ty, C);
391 }
392 
393 /// Returns "best known" trip count for the specified loop \p L as defined by
394 /// the following procedure:
395 ///   1) Returns exact trip count if it is known.
396 ///   2) Returns expected trip count according to profile data if any.
397 ///   3) Returns upper bound estimate if it is known.
398 ///   4) Returns None if all of the above failed.
399 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
400   // Check if exact trip count is known.
401   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
402     return ExpectedTC;
403 
404   // Check if there is an expected trip count available from profile data.
405   if (LoopVectorizeWithBlockFrequency)
406     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
407       return EstimatedTC;
408 
409   // Check if upper bound estimate is known.
410   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
411     return ExpectedTC;
412 
413   return None;
414 }
415 
416 // Forward declare GeneratedRTChecks.
417 class GeneratedRTChecks;
418 
419 namespace llvm {
420 
421 AnalysisKey ShouldRunExtraVectorPasses::Key;
422 
423 /// InnerLoopVectorizer vectorizes loops which contain only one basic
424 /// block to a specified vectorization factor (VF).
425 /// This class performs the widening of scalars into vectors, or multiple
426 /// scalars. This class also implements the following features:
427 /// * It inserts an epilogue loop for handling loops that don't have iteration
428 ///   counts that are known to be a multiple of the vectorization factor.
429 /// * It handles the code generation for reduction variables.
430 /// * Scalarization (implementation using scalars) of un-vectorizable
431 ///   instructions.
432 /// InnerLoopVectorizer does not perform any vectorization-legality
433 /// checks, and relies on the caller to check for the different legality
434 /// aspects. The InnerLoopVectorizer relies on the
435 /// LoopVectorizationLegality class to provide information about the
436 /// induction and reduction variables that were found in the loop.
437 class InnerLoopVectorizer {
438 public:
439   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
440                       LoopInfo *LI, DominatorTree *DT,
441                       const TargetLibraryInfo *TLI,
442                       const TargetTransformInfo *TTI, AssumptionCache *AC,
443                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
444                       ElementCount MinProfitableTripCount,
445                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
446                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
447                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
448       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
449         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
450         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
451         PSI(PSI), RTChecks(RTChecks) {
452     // Query this against the original loop and save it here because the profile
453     // of the original loop header may change as the transformation happens.
454     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
455         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
456 
457     if (MinProfitableTripCount.isZero())
458       this->MinProfitableTripCount = VecWidth;
459     else
460       this->MinProfitableTripCount = MinProfitableTripCount;
461   }
462 
463   virtual ~InnerLoopVectorizer() = default;
464 
465   /// Create a new empty loop that will contain vectorized instructions later
466   /// on, while the old loop will be used as the scalar remainder. Control flow
467   /// is generated around the vectorized (and scalar epilogue) loops consisting
468   /// of various checks and bypasses. Return the pre-header block of the new
469   /// loop and the start value for the canonical induction, if it is != 0. The
470   /// latter is the case when vectorizing the epilogue loop. In the case of
471 /// epilogue vectorization, this function is overridden to handle the more
472   /// complex control flow around the loops.
473   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
474 
475   /// Widen a single call instruction within the innermost loop.
476   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
477                             VPTransformState &State);
478 
479   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
480   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
481 
482   // Return true if any runtime check is added.
483   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
484 
485   /// A type for vectorized values in the new loop. Each value from the
486   /// original loop, when vectorized, is represented by UF vector values in the
487   /// new unrolled loop, where UF is the unroll factor.
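  /// For example, with VF = 4 and UF = 2, a single i32 value from the original
  /// loop is represented by two <4 x i32> values in the vectorized loop.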
488   using VectorParts = SmallVector<Value *, 2>;
489 
490   /// A helper function to scalarize a single Instruction in the innermost
491   /// loop. Generates a scalar instance of \p Instr for the single lane and
492   /// part given by \p Instance, predicating the generated instructions if
493   /// \p IfPredicateInstr is set. Uses the VPValue operands from \p RepRecipe
494   /// instead of \p Instr's operands.
495   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
496                             const VPIteration &Instance, bool IfPredicateInstr,
497                             VPTransformState &State);
498 
499   /// Construct the vector value of a scalarized value \p Def one lane at a time.
500   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
501                                  VPTransformState &State);
502 
503   /// Try to vectorize interleaved access group \p Group with the base address
504   /// given in \p Addr, optionally masking the vector operations if \p
505   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
506   /// values in the vectorized loop.
507   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
508                                 ArrayRef<VPValue *> VPDefs,
509                                 VPTransformState &State, VPValue *Addr,
510                                 ArrayRef<VPValue *> StoredValues,
511                                 VPValue *BlockInMask = nullptr);
512 
513   /// Fix the non-induction PHIs in \p Plan.
514   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
515 
516   /// Returns true if the reordering of FP operations is not allowed, but we are
517   /// able to vectorize with strict in-order reductions for the given RdxDesc.
518   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
519 
520   /// Create a broadcast instruction. This method generates a broadcast
521   /// instruction (shuffle) for loop invariant values and for the induction
522   /// value. If this is the induction variable then we extend it to N, N+1,
523   /// ...; this is needed because each iteration in the loop corresponds to a
524   /// SIMD element.
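  /// For example, with VF = 4, a loop-invariant value %x is turned into a
  /// <4 x ty> splat of %x, while the induction variable i becomes the vector
  /// <i, i+1, i+2, i+3> (illustrative).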
525   virtual Value *getBroadcastInstrs(Value *V);
526 
527   // Returns the resume value (bc.merge.rdx) for a reduction as
528   // generated by fixReduction.
529   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
530 
531 protected:
532   friend class LoopVectorizationPlanner;
533 
534   /// A small list of PHINodes.
535   using PhiVector = SmallVector<PHINode *, 4>;
536 
537   /// A type for scalarized values in the new loop. Each value from the
538   /// original loop, when scalarized, is represented by UF x VF scalar values
539   /// in the new unrolled loop, where UF is the unroll factor and VF is the
540   /// vectorization factor.
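  /// For example, with VF = 4 and UF = 2 this holds 2 x 4 scalar values, one
  /// per (part, lane) pair.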
541   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
542 
543   /// Set up the values of the IVs correctly when exiting the vector loop.
544   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
545                     Value *VectorTripCount, Value *EndValue,
546                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
547                     VPlan &Plan);
548 
549   /// Handle all cross-iteration phis in the header.
550   void fixCrossIterationPHIs(VPTransformState &State);
551 
552   /// Create the exit value of first-order recurrences in the middle block and
553   /// update their users.
554   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
555                                VPTransformState &State);
556 
557   /// Create code for the loop exit value of the reduction.
558   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
559 
560   /// Clear NSW/NUW flags from reduction instructions if necessary.
561   void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
562                                VPTransformState &State);
563 
564   /// Iteratively sink the scalarized operands of a predicated instruction into
565   /// the block that was created for it.
566   void sinkScalarOperands(Instruction *PredInst);
567 
568   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
569   /// represented as.
570   void truncateToMinimalBitwidths(VPTransformState &State);
571 
572   /// Returns (and creates if needed) the original loop trip count.
573   Value *getOrCreateTripCount(BasicBlock *InsertBlock);
574 
575   /// Returns (and creates if needed) the trip count of the widened loop.
576   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
577 
578   /// Returns a bitcasted value to the requested vector type.
579   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
580   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
581                                 const DataLayout &DL);
582 
583   /// Emit a bypass check to see if the vector trip count is zero, including if
584   /// it overflows.
585   void emitIterationCountCheck(BasicBlock *Bypass);
586 
587   /// Emit a bypass check to see if all of the SCEV assumptions we've
588   /// had to make are correct. Returns the block containing the checks or
589   /// nullptr if no checks have been added.
590   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
591 
592   /// Emit bypass checks to check any memory assumptions we may have made.
593   /// Returns the block containing the checks or nullptr if no checks have been
594   /// added.
595   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
596 
597   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
598   /// vector loop preheader, middle block and scalar preheader.
599   void createVectorLoopSkeleton(StringRef Prefix);
600 
601   /// Create new phi nodes for the induction variables to resume the iteration
602   /// count in the scalar epilogue, from where the vectorized loop left off.
603   /// In cases where the loop skeleton is more complicated (e.g. epilogue
604   /// vectorization) and the resume values can come from an additional bypass
605   /// block, the \p AdditionalBypass pair provides information about the bypass
606   /// block and the end value on the edge from bypass to this loop.
607   void createInductionResumeValues(
608       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
609 
610   /// Complete the loop skeleton by adding debug MDs, creating appropriate
611   /// conditional branches in the middle block, preparing the builder and
612   /// running the verifier. Return the preheader of the completed vector loop.
613   BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
614 
615   /// Collect poison-generating recipes that may generate a poison value that is
616   /// used after vectorization, even when their operands are not poison. Those
617   /// recipes meet the following conditions:
618   ///  * Contribute to the address computation of a recipe generating a widen
619   ///    memory load/store (VPWidenMemoryInstructionRecipe or
620   ///    VPInterleaveRecipe).
621   ///  * Such a widen memory load/store has at least one underlying Instruction
622   ///    that is in a basic block that needs predication and after vectorization
623   ///    the generated instruction won't be predicated.
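  ///
  /// For example (illustrative), consider an inbounds getelementptr feeding the
  /// address of a consecutive masked load whose original block was predicated:
  /// if the widened load is emitted without predication, the inbounds flag must
  /// later be dropped so that no poison is introduced for the masked-off lanes.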
624   void collectPoisonGeneratingRecipes(VPTransformState &State);
625 
626   /// Allow subclasses to override and print debug traces before/after vplan
627   /// execution, when trace information is requested.
628   virtual void printDebugTracesAtStart() {}
629   virtual void printDebugTracesAtEnd() {}
630 
631   /// The original loop.
632   Loop *OrigLoop;
633 
634   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
635   /// dynamic knowledge to simplify SCEV expressions and converts them to a
636   /// more usable form.
637   PredicatedScalarEvolution &PSE;
638 
639   /// Loop Info.
640   LoopInfo *LI;
641 
642   /// Dominator Tree.
643   DominatorTree *DT;
644 
645   /// Alias Analysis.
646   AAResults *AA;
647 
648   /// Target Library Info.
649   const TargetLibraryInfo *TLI;
650 
651   /// Target Transform Info.
652   const TargetTransformInfo *TTI;
653 
654   /// Assumption Cache.
655   AssumptionCache *AC;
656 
657   /// Interface to emit optimization remarks.
658   OptimizationRemarkEmitter *ORE;
659 
660   /// The vectorization SIMD factor to use. Each vector will have this many
661   /// vector elements.
662   ElementCount VF;
663 
664   ElementCount MinProfitableTripCount;
665 
666   /// The vectorization unroll factor to use. Each scalar is vectorized to this
667   /// many different vector instructions.
668   unsigned UF;
669 
670   /// The builder that we use
671   IRBuilder<> Builder;
672 
673   // --- Vectorization state ---
674 
675   /// The vector-loop preheader.
676   BasicBlock *LoopVectorPreHeader;
677 
678   /// The scalar-loop preheader.
679   BasicBlock *LoopScalarPreHeader;
680 
681   /// Middle Block between the vector and the scalar.
682   BasicBlock *LoopMiddleBlock;
683 
684   /// The unique ExitBlock of the scalar loop if one exists.  Note that
685   /// there can be multiple exiting edges reaching this block.
686   BasicBlock *LoopExitBlock;
687 
688   /// The scalar loop body.
689   BasicBlock *LoopScalarBody;
690 
691   /// A list of all bypass blocks. The first block is the entry of the loop.
692   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
693 
694   /// Store instructions that were predicated.
695   SmallVector<Instruction *, 4> PredicatedInstructions;
696 
697   /// Trip count of the original loop.
698   Value *TripCount = nullptr;
699 
700   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
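  /// For example, with an original trip count of 10, VF = 4 and UF = 1, the
  /// vector trip count is 8 and the remaining 2 iterations run in the scalar
  /// epilogue loop.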
701   Value *VectorTripCount = nullptr;
702 
703   /// The legality analysis.
704   LoopVectorizationLegality *Legal;
705 
706   /// The profitability analysis.
707   LoopVectorizationCostModel *Cost;
708 
709   // Record whether runtime checks are added.
710   bool AddedSafetyChecks = false;
711 
712   // Holds the end values for each induction variable. We save the end values
713   // so we can later fix up the external users of the induction variables.
714   DenseMap<PHINode *, Value *> IVEndValues;
715 
716   /// BFI and PSI are used to check for profile guided size optimizations.
717   BlockFrequencyInfo *BFI;
718   ProfileSummaryInfo *PSI;
719 
720   // Whether this loop should be optimized for size based on profile-guided
721   // size optimizations.
722   bool OptForSizeBasedOnProfile;
723 
724   /// Structure to hold information about generated runtime checks, responsible
725   /// for cleaning up the checks if vectorization turns out to be unprofitable.
726   GeneratedRTChecks &RTChecks;
727 
728   // Holds the resume values for reductions in the loops, used to set the
729   // correct start value of reduction PHIs when vectorizing the epilogue.
730   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
731       ReductionResumeValues;
732 };
733 
734 class InnerLoopUnroller : public InnerLoopVectorizer {
735 public:
736   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
737                     LoopInfo *LI, DominatorTree *DT,
738                     const TargetLibraryInfo *TLI,
739                     const TargetTransformInfo *TTI, AssumptionCache *AC,
740                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
741                     LoopVectorizationLegality *LVL,
742                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
743                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
744       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
745                             ElementCount::getFixed(1),
746                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
747                             BFI, PSI, Check) {}
748 
749 private:
750   Value *getBroadcastInstrs(Value *V) override;
751 };
752 
753 /// Encapsulate information regarding vectorization of a loop and its epilogue.
754 /// This information is meant to be updated and used across two stages of
755 /// epilogue vectorization.
756 struct EpilogueLoopVectorizationInfo {
757   ElementCount MainLoopVF = ElementCount::getFixed(0);
758   unsigned MainLoopUF = 0;
759   ElementCount EpilogueVF = ElementCount::getFixed(0);
760   unsigned EpilogueUF = 0;
761   BasicBlock *MainLoopIterationCountCheck = nullptr;
762   BasicBlock *EpilogueIterationCountCheck = nullptr;
763   BasicBlock *SCEVSafetyCheck = nullptr;
764   BasicBlock *MemSafetyCheck = nullptr;
765   Value *TripCount = nullptr;
766   Value *VectorTripCount = nullptr;
767 
768   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
769                                 ElementCount EVF, unsigned EUF)
770       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
771     assert(EUF == 1 &&
772            "A high UF for the epilogue loop is likely not beneficial.");
773   }
774 };
775 
776 /// An extension of the inner loop vectorizer that creates a skeleton for a
777 /// vectorized loop that has its epilogue (residual) also vectorized.
778 /// The idea is to run the vplan on a given loop twice, first to set up the
779 /// skeleton and vectorize the main loop, and second to complete the skeleton
780 /// from the first step and vectorize the epilogue.  This is achieved by
781 /// deriving two concrete strategy classes from this base class and invoking
782 /// them in succession from the loop vectorizer planner.
783 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
784 public:
785   InnerLoopAndEpilogueVectorizer(
786       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
787       DominatorTree *DT, const TargetLibraryInfo *TLI,
788       const TargetTransformInfo *TTI, AssumptionCache *AC,
789       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
790       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
791       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
792       GeneratedRTChecks &Checks)
793       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
794                             EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
795                             CM, BFI, PSI, Checks),
796         EPI(EPI) {}
797 
798   // Override this function to handle the more complex control flow around the
799   // three loops.
800   std::pair<BasicBlock *, Value *>
801   createVectorizedLoopSkeleton() final override {
802     return createEpilogueVectorizedLoopSkeleton();
803   }
804 
805   /// The interface for creating a vectorized skeleton using one of two
806   /// different strategies, each corresponding to one execution of the vplan
807   /// as described above.
808   virtual std::pair<BasicBlock *, Value *>
809   createEpilogueVectorizedLoopSkeleton() = 0;
810 
811   /// Holds and updates state information required to vectorize the main loop
812   /// and its epilogue in two separate passes. This setup helps us avoid
813   /// regenerating and recomputing runtime safety checks. It also helps us to
814   /// shorten the iteration-count-check path length for the cases where the
815   /// iteration count of the loop is so small that the main vector loop is
816   /// completely skipped.
817   EpilogueLoopVectorizationInfo &EPI;
818 };
819 
820 /// A specialized derived class of inner loop vectorizer that performs
821 /// vectorization of *main* loops in the process of vectorizing loops and their
822 /// epilogues.
823 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
824 public:
825   EpilogueVectorizerMainLoop(
826       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
827       DominatorTree *DT, const TargetLibraryInfo *TLI,
828       const TargetTransformInfo *TTI, AssumptionCache *AC,
829       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
830       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
831       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
832       GeneratedRTChecks &Check)
833       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
834                                        EPI, LVL, CM, BFI, PSI, Check) {}
835   /// Implements the interface for creating a vectorized skeleton using the
836   /// *main loop* strategy (i.e. the first pass of vplan execution).
837   std::pair<BasicBlock *, Value *>
838   createEpilogueVectorizedLoopSkeleton() final override;
839 
840 protected:
841   /// Emits an iteration count bypass check once for the main loop (when \p
842   /// ForEpilogue is false) and once for the epilogue loop (when \p
843   /// ForEpilogue is true).
844   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
845   void printDebugTracesAtStart() override;
846   void printDebugTracesAtEnd() override;
847 };
848 
849 // A specialized derived class of inner loop vectorizer that performs
850 // vectorization of *epilogue* loops in the process of vectorizing loops and
851 // their epilogues.
852 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
853 public:
854   EpilogueVectorizerEpilogueLoop(
855       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
856       DominatorTree *DT, const TargetLibraryInfo *TLI,
857       const TargetTransformInfo *TTI, AssumptionCache *AC,
858       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
859       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
860       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
861       GeneratedRTChecks &Checks)
862       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
863                                        EPI, LVL, CM, BFI, PSI, Checks) {
864     TripCount = EPI.TripCount;
865   }
866   /// Implements the interface for creating a vectorized skeleton using the
867   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
868   std::pair<BasicBlock *, Value *>
869   createEpilogueVectorizedLoopSkeleton() final override;
870 
871 protected:
872   /// Emits an iteration count bypass check after the main vector loop has
873   /// finished to see if there are any iterations left to execute by either
874   /// the vector epilogue or the scalar epilogue.
875   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
876                                                       BasicBlock *Insert);
878   void printDebugTracesAtStart() override;
879   void printDebugTracesAtEnd() override;
880 };
881 } // end namespace llvm
882 
883 /// Look for a meaningful debug location on the instruction or its
884 /// operands.
885 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
886   if (!I)
887     return I;
888 
889   DebugLoc Empty;
890   if (I->getDebugLoc() != Empty)
891     return I;
892 
893   for (Use &Op : I->operands()) {
894     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
895       if (OpInst->getDebugLoc() != Empty)
896         return OpInst;
897   }
898 
899   return I;
900 }
901 
902 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
903 /// is passed, the message relates to that particular instruction.
904 #ifndef NDEBUG
905 static void debugVectorizationMessage(const StringRef Prefix,
906                                       const StringRef DebugMsg,
907                                       Instruction *I) {
908   dbgs() << "LV: " << Prefix << DebugMsg;
909   if (I != nullptr)
910     dbgs() << " " << *I;
911   else
912     dbgs() << '.';
913   dbgs() << '\n';
914 }
915 #endif
916 
917 /// Create an analysis remark that explains why vectorization failed
918 ///
919 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
920 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
921 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
922 /// the location of the remark.  \return the remark object that can be
923 /// streamed to.
924 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
925     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
926   Value *CodeRegion = TheLoop->getHeader();
927   DebugLoc DL = TheLoop->getStartLoc();
928 
929   if (I) {
930     CodeRegion = I->getParent();
931     // If there is no debug location attached to the instruction, fall back to
932     // using the loop's.
933     if (I->getDebugLoc())
934       DL = I->getDebugLoc();
935   }
936 
937   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
938 }
939 
940 namespace llvm {
941 
942 /// Return a value for Step multiplied by VF.
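/// For example, for a fixed VF of 4 and Step = 2 this returns the constant 8;
/// for a scalable VF of <vscale x 4> it returns 8 * vscale (illustrative).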
943 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
944                        int64_t Step) {
945   assert(Ty->isIntegerTy() && "Expected an integer step");
946   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
947   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
948 }
949 
950 /// Return the runtime value for VF.
951 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
952   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
953   return VF.isScalable() ? B.CreateVScale(EC) : EC;
954 }
955 
956 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
957                                   ElementCount VF) {
958   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
959   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
960   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
961   return B.CreateUIToFP(RuntimeVF, FTy);
962 }
963 
964 void reportVectorizationFailure(const StringRef DebugMsg,
965                                 const StringRef OREMsg, const StringRef ORETag,
966                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
967                                 Instruction *I) {
968   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
969   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
970   ORE->emit(
971       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
972       << "loop not vectorized: " << OREMsg);
973 }
974 
975 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
976                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
977                              Instruction *I) {
978   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
979   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
980   ORE->emit(
981       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
982       << Msg);
983 }
984 
985 } // end namespace llvm
986 
987 #ifndef NDEBUG
988 /// \return string containing a file name and a line # for the given loop.
989 static std::string getDebugLocString(const Loop *L) {
990   std::string Result;
991   if (L) {
992     raw_string_ostream OS(Result);
993     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
994       LoopDbgLoc.print(OS);
995     else
996       // Just print the module name.
997       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
998     OS.flush();
999   }
1000   return Result;
1001 }
1002 #endif
1003 
1004 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1005     VPTransformState &State) {
1006 
1007   // Collect recipes in the backward slice of `Root` that may generate a poison
1008   // value that is used after vectorization.
1009   SmallPtrSet<VPRecipeBase *, 16> Visited;
1010   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1011     SmallVector<VPRecipeBase *, 16> Worklist;
1012     Worklist.push_back(Root);
1013 
1014     // Traverse the backward slice of Root through its use-def chain.
1015     while (!Worklist.empty()) {
1016       VPRecipeBase *CurRec = Worklist.back();
1017       Worklist.pop_back();
1018 
1019       if (!Visited.insert(CurRec).second)
1020         continue;
1021 
1022       // Prune search if we find another recipe generating a widen memory
1023       // instruction. Widen memory instructions involved in address computation
1024       // will lead to gather/scatter instructions, which don't need to be
1025       // handled.
1026       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1027           isa<VPInterleaveRecipe>(CurRec) ||
1028           isa<VPScalarIVStepsRecipe>(CurRec) ||
1029           isa<VPCanonicalIVPHIRecipe>(CurRec) ||
1030           isa<VPActiveLaneMaskPHIRecipe>(CurRec))
1031         continue;
1032 
1033       // This recipe contributes to the address computation of a widen
1034       // load/store. Collect recipe if its underlying instruction has
1035       // poison-generating flags.
1036       Instruction *Instr = CurRec->getUnderlyingInstr();
1037       if (Instr && Instr->hasPoisonGeneratingFlags())
1038         State.MayGeneratePoisonRecipes.insert(CurRec);
1039 
1040       // Add new definitions to the worklist.
1041       for (VPValue *operand : CurRec->operands())
1042         if (VPDef *OpDef = operand->getDef())
1043           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1044     }
1045   });
1046 
1047   // Traverse all the recipes in the VPlan and collect the poison-generating
1048   // recipes in the backward slice starting at the address of a
1049   // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1050   auto Iter = depth_first(
1051       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1052   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1053     for (VPRecipeBase &Recipe : *VPBB) {
1054       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1055         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1056         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1057         if (AddrDef && WidenRec->isConsecutive() &&
1058             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1059           collectPoisonGeneratingInstrsInBackwardSlice(
1060               cast<VPRecipeBase>(AddrDef));
1061       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1062         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1063         if (AddrDef) {
1064           // Check if any member of the interleave group needs predication.
1065           const InterleaveGroup<Instruction> *InterGroup =
1066               InterleaveRec->getInterleaveGroup();
1067           bool NeedPredication = false;
1068           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1069                I < NumMembers; ++I) {
1070             Instruction *Member = InterGroup->getMember(I);
1071             if (Member)
1072               NeedPredication |=
1073                   Legal->blockNeedsPredication(Member->getParent());
1074           }
1075 
1076           if (NeedPredication)
1077             collectPoisonGeneratingInstrsInBackwardSlice(
1078                 cast<VPRecipeBase>(AddrDef));
1079         }
1080       }
1081     }
1082   }
1083 }
1084 
1085 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1086     const RecurrenceDescriptor &RdxDesc) {
1087   auto It = ReductionResumeValues.find(&RdxDesc);
1088   assert(It != ReductionResumeValues.end() &&
1089          "Expected to find a resume value for the reduction.");
1090   return It->second;
1091 }
1092 
1093 namespace llvm {
1094 
1095 // Hints for the loop vectorization cost model about how the scalar epilogue
1096 // loop should be lowered.
1097 enum ScalarEpilogueLowering {
1098 
1099   // The default: allowing scalar epilogues.
1100   CM_ScalarEpilogueAllowed,
1101 
1102   // Vectorization with OptForSize: don't allow epilogues.
1103   CM_ScalarEpilogueNotAllowedOptSize,
1104 
1105   // A special case of vectorization with OptForSize: loops with a very small
1106   // trip count are considered for vectorization under OptForSize, thereby
1107   // making sure the cost of their loop body is dominant, free of runtime
1108   // guards and scalar iteration overheads.
1109   CM_ScalarEpilogueNotAllowedLowTripLoop,
1110 
1111   // Loop hint predicate indicating an epilogue is undesired.
1112   CM_ScalarEpilogueNotNeededUsePredicate,
1113 
1114   // Directive indicating we must either tail fold or not vectorize
1115   CM_ScalarEpilogueNotAllowedUsePredicate
1116 };
1117 
1118 /// ElementCountComparator creates a total ordering for ElementCount
1119 /// for the purposes of using it in a set structure.
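/// For example, this orders 2, 4, 8 (fixed) before vscale x 2, vscale x 4
/// (scalable).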
1120 struct ElementCountComparator {
1121   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1122     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1123            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1124   }
1125 };
1126 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1127 
1128 /// LoopVectorizationCostModel - estimates the expected speedups due to
1129 /// vectorization.
1130 /// In many cases vectorization is not profitable. This can happen for a
1131 /// number of reasons. In this class we mainly attempt to predict the
1132 /// expected speedup/slowdowns due to the supported instruction set. We use the
1133 /// TargetTransformInfo to query the different backends for the cost of
1134 /// different operations.
1135 class LoopVectorizationCostModel {
1136 public:
1137   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1138                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1139                              LoopVectorizationLegality *Legal,
1140                              const TargetTransformInfo &TTI,
1141                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1142                              AssumptionCache *AC,
1143                              OptimizationRemarkEmitter *ORE, const Function *F,
1144                              const LoopVectorizeHints *Hints,
1145                              InterleavedAccessInfo &IAI)
1146       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1147         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1148         Hints(Hints), InterleaveInfo(IAI) {}
1149 
1150   /// \return An upper bound for the vectorization factors (both fixed and
1151   /// scalable). If the factors are 0, vectorization and interleaving should be
1152   /// avoided up front.
1153   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1154 
1155   /// \return True if runtime checks are required for vectorization, and false
1156   /// otherwise.
1157   bool runtimeChecksRequired();
1158 
1159   /// \return The most profitable vectorization factor and the cost of that VF.
1160   /// This method checks every VF in \p CandidateVFs. If UserVF is not zero,
1161   /// then this vectorization factor will be selected if vectorization is
1162   /// possible.
1163   VectorizationFactor
1164   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1165 
1166   VectorizationFactor
1167   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1168                                     const LoopVectorizationPlanner &LVP);
1169 
1170   /// Setup cost-based decisions for user vectorization factor.
1171   /// \return true if the UserVF is a feasible VF to be chosen.
1172   bool selectUserVectorizationFactor(ElementCount UserVF) {
1173     collectUniformsAndScalars(UserVF);
1174     collectInstsToScalarize(UserVF);
1175     return expectedCost(UserVF).first.isValid();
1176   }
1177 
1178   /// \return The size (in bits) of the smallest and widest types in the code
1179   /// that needs to be vectorized. We ignore values that remain scalar such as
1180   /// 64 bit loop indices.
1181   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1182 
1183   /// \return The desired interleave count.
1184   /// If interleave count has been specified by metadata it will be returned.
1185   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1186   /// are the selected vectorization factor and the cost of the selected VF.
1187   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1188 
1189   /// A memory access instruction may be vectorized in more than one way.
1190   /// The form of the instruction after vectorization depends on its cost.
1191   /// This function takes cost-based decisions for Load/Store instructions
1192   /// and collects them in a map. This decision map is used for building
1193   /// the lists of loop-uniform and loop-scalar instructions.
1194   /// The calculated cost is saved with the widening decision in order to
1195   /// avoid redundant calculations.
1196   void setCostBasedWideningDecision(ElementCount VF);
1197 
1198   /// A struct that represents some properties of the register usage
1199   /// of a loop.
1200   struct RegisterUsage {
1201     /// Holds the number of loop invariant values that are used in the loop.
1202     /// The key is ClassID of target-provided register class.
1203     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1204     /// Holds the maximum number of concurrent live intervals in the loop.
1205     /// The key is ClassID of target-provided register class.
1206     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1207   };
1208 
1209   /// \return Returns information about the register usages of the loop for the
1210   /// given vectorization factors.
1211   SmallVector<RegisterUsage, 8>
1212   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1213 
1214   /// Collect values we want to ignore in the cost model.
1215   void collectValuesToIgnore();
1216 
1217   /// Collect all element types in the loop for which widening is needed.
1218   void collectElementTypesForWidening();
1219 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1222   void collectInLoopReductions();
1223 
1224   /// Returns true if we should use strict in-order reductions for the given
1225   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1226   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1227   /// of FP operations.
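  /// For example (an illustrative sketch): with a strict FP accumulation such
  /// as
  ///   float s = 0.f;
  ///   for (int i = 0; i < n; ++i) s += a[i];   // no reassociation allowed
  /// the recurrence is marked ordered, and unless the hints permit reordering
  /// the reduction must be emitted in-order (in-loop) rather than as a
  /// reassociating horizontal reduction.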
1228   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1229     return !Hints->allowReordering() && RdxDesc.isOrdered();
1230   }
1231 
1232   /// \returns The smallest bitwidth each instruction can be represented with.
1233   /// The vector equivalents of these instructions should be truncated to this
1234   /// type.
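  /// For example (an illustrative sketch): if DemandedBits proves that an i32
  /// computation in the loop only ever needs its low 8 bits, MinBWs may map it
  /// to 8, and the widened form can operate on <VF x i8> with truncates and
  /// extends at the boundaries of the narrowed region.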
1235   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1236     return MinBWs;
1237   }
1238 
1239   /// \returns True if it is more profitable to scalarize instruction \p I for
1240   /// vectorization factor \p VF.
1241   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1242     assert(VF.isVector() &&
1243            "Profitable to scalarize relevant only for VF > 1.");
1244 
1245     // Cost model is not run in the VPlan-native path - return conservative
1246     // result until this changes.
1247     if (EnableVPlanNativePath)
1248       return false;
1249 
1250     auto Scalars = InstsToScalarize.find(VF);
1251     assert(Scalars != InstsToScalarize.end() &&
1252            "VF not yet analyzed for scalarization profitability");
1253     return Scalars->second.find(I) != Scalars->second.end();
1254   }
1255 
1256   /// Returns true if \p I is known to be uniform after vectorization.
1257   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1258     if (VF.isScalar())
1259       return true;
1260 
1261     // Cost model is not run in the VPlan-native path - return conservative
1262     // result until this changes.
1263     if (EnableVPlanNativePath)
1264       return false;
1265 
1266     auto UniformsPerVF = Uniforms.find(VF);
1267     assert(UniformsPerVF != Uniforms.end() &&
1268            "VF not yet analyzed for uniformity");
1269     return UniformsPerVF->second.count(I);
1270   }
1271 
1272   /// Returns true if \p I is known to be scalar after vectorization.
1273   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1274     if (VF.isScalar())
1275       return true;
1276 
1277     // Cost model is not run in the VPlan-native path - return conservative
1278     // result until this changes.
1279     if (EnableVPlanNativePath)
1280       return false;
1281 
1282     auto ScalarsPerVF = Scalars.find(VF);
1283     assert(ScalarsPerVF != Scalars.end() &&
1284            "Scalar values are not calculated for VF");
1285     return ScalarsPerVF->second.count(I);
1286   }
1287 
1288   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1289   /// for vectorization factor \p VF.
1290   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1291     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1292            !isProfitableToScalarize(I, VF) &&
1293            !isScalarAfterVectorization(I, VF);
1294   }
1295 
1296   /// Decision that was taken during cost calculation for memory instruction.
1297   enum InstWidening {
1298     CM_Unknown,
1299     CM_Widen,         // For consecutive accesses with stride +1.
1300     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1301     CM_Interleave,
1302     CM_GatherScatter,
1303     CM_Scalarize
1304   };
1305 
1306   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1307   /// instruction \p I and vector width \p VF.
1308   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1309                            InstructionCost Cost) {
1310     assert(VF.isVector() && "Expected VF >=2");
1311     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1312   }
1313 
1314   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1315   /// interleaving group \p Grp and vector width \p VF.
1316   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1317                            ElementCount VF, InstWidening W,
1318                            InstructionCost Cost) {
1319     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1322     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1323       if (auto *I = Grp->getMember(i)) {
1324         if (Grp->getInsertPos() == I)
1325           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1326         else
1327           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1328       }
1329     }
1330   }
1331 
1332   /// Return the cost model decision for the given instruction \p I and vector
1333   /// width \p VF. Return CM_Unknown if this instruction did not pass
1334   /// through the cost modeling.
1335   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1336     assert(VF.isVector() && "Expected VF to be a vector VF");
1337     // Cost model is not run in the VPlan-native path - return conservative
1338     // result until this changes.
1339     if (EnableVPlanNativePath)
1340       return CM_GatherScatter;
1341 
1342     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1343     auto Itr = WideningDecisions.find(InstOnVF);
1344     if (Itr == WideningDecisions.end())
1345       return CM_Unknown;
1346     return Itr->second.first;
1347   }
1348 
1349   /// Return the vectorization cost for the given instruction \p I and vector
1350   /// width \p VF.
1351   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1352     assert(VF.isVector() && "Expected VF >=2");
1353     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1354     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1355            "The cost is not calculated");
1356     return WideningDecisions[InstOnVF].second;
1357   }
1358 
1359   /// Return True if instruction \p I is an optimizable truncate whose operand
1360   /// is an induction variable. Such a truncate will be removed by adding a new
1361   /// induction variable with the destination type.
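  /// For example (an illustrative sketch): for
  ///   %iv    = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  ///   %trunc = trunc i64 %iv to i32
  /// the truncate can be removed by introducing a separate i32 induction
  /// variable, provided the truncate is not already free for the target.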
1362   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1363     // If the instruction is not a truncate, return false.
1364     auto *Trunc = dyn_cast<TruncInst>(I);
1365     if (!Trunc)
1366       return false;
1367 
1368     // Get the source and destination types of the truncate.
1369     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1370     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1371 
1372     // If the truncate is free for the given types, return false. Replacing a
1373     // free truncate with an induction variable would add an induction variable
1374     // update instruction to each iteration of the loop. We exclude from this
1375     // check the primary induction variable since it will need an update
1376     // instruction regardless.
1377     Value *Op = Trunc->getOperand(0);
1378     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1379       return false;
1380 
1381     // If the truncated value is not an induction variable, return false.
1382     return Legal->isInductionPhi(Op);
1383   }
1384 
1385   /// Collects the instructions to scalarize for each predicated instruction in
1386   /// the loop.
1387   void collectInstsToScalarize(ElementCount VF);
1388 
1389   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decisions for Load/Store instructions
  /// that may be vectorized as interleaved, gather-scatter or scalarized.
1392   void collectUniformsAndScalars(ElementCount VF) {
1393     // Do the analysis once.
1394     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1395       return;
1396     setCostBasedWideningDecision(VF);
1397     collectLoopUniforms(VF);
1398     collectLoopScalars(VF);
1399   }
1400 
1401   /// Returns true if the target machine supports masked store operation
1402   /// for the given \p DataType and kind of access to \p Ptr.
1403   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1404     return Legal->isConsecutivePtr(DataType, Ptr) &&
1405            TTI.isLegalMaskedStore(DataType, Alignment);
1406   }
1407 
1408   /// Returns true if the target machine supports masked load operation
1409   /// for the given \p DataType and kind of access to \p Ptr.
1410   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1411     return Legal->isConsecutivePtr(DataType, Ptr) &&
1412            TTI.isLegalMaskedLoad(DataType, Alignment);
1413   }
1414 
1415   /// Returns true if the target machine can represent \p V as a masked gather
1416   /// or scatter operation.
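  /// For example (an illustrative sketch): an indexed access such as
  /// x = a[b[i]] can only be widened as a gather if TTI reports masked gathers
  /// as legal for <VF x EltTy> at the access's alignment; otherwise the access
  /// has to be scalarized.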
1417   bool isLegalGatherOrScatter(Value *V,
1418                               ElementCount VF = ElementCount::getFixed(1)) {
1419     bool LI = isa<LoadInst>(V);
1420     bool SI = isa<StoreInst>(V);
1421     if (!LI && !SI)
1422       return false;
1423     auto *Ty = getLoadStoreType(V);
1424     Align Align = getLoadStoreAlignment(V);
1425     if (VF.isVector())
1426       Ty = VectorType::get(Ty, VF);
1427     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1428            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1429   }
1430 
1431   /// Returns true if the target machine supports all of the reduction
1432   /// variables found for the given VF.
1433   bool canVectorizeReductions(ElementCount VF) const {
1434     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1435       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1436       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1437     }));
1438   }
1439 
1440   /// Returns true if \p I is an instruction that will be scalarized with
1441   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1442   /// instructions include conditional stores and instructions that may divide
1443   /// by zero.
1444   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1445 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  /// Superset of instructions that return true for isScalarWithPredication.
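  /// For example (an illustrative sketch): in
  ///   for (int i = 0; i < n; ++i)
  ///     if (c[i]) a[i] = 0;
  /// the store is predicated; depending on target support it is emitted as a
  /// masked store or scalarized behind a per-lane guard. Tail folding makes
  /// even unconditional accesses predicated for the same reason.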
1450   bool isPredicatedInst(Instruction *I, ElementCount VF,
1451                         bool IsKnownUniform = false) {
    // When we know the load is uniform and the original scalar loop was not
    // predicated, we don't need to mark it as a predicated instruction. Any
    // vectorized blocks created when tail-folding are artificial blocks that
    // we have introduced, and we know there is always at least one active
    // lane. That's why we call Legal->blockNeedsPredication here, because it
    // doesn't query tail-folding.
1458     if (IsKnownUniform && isa<LoadInst>(I) &&
1459         !Legal->blockNeedsPredication(I->getParent()))
1460       return false;
1461     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1462       return false;
1463     // Loads and stores that need some form of masked operation are predicated
1464     // instructions.
1465     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1466       return Legal->isMaskRequired(I);
1467     return isScalarWithPredication(I, VF);
1468   }
1469 
1470   /// Returns true if \p I is a memory instruction with consecutive memory
1471   /// access that can be widened.
1472   bool
1473   memoryInstructionCanBeWidened(Instruction *I,
1474                                 ElementCount VF = ElementCount::getFixed(1));
1475 
1476   /// Returns true if \p I is a memory instruction in an interleaved-group
1477   /// of memory accesses that can be vectorized with wide vector loads/stores
1478   /// and shuffles.
1479   bool
1480   interleavedAccessCanBeWidened(Instruction *I,
1481                                 ElementCount VF = ElementCount::getFixed(1));
1482 
1483   /// Check if \p Instr belongs to any interleaved access group.
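  /// For example (an illustrative sketch): the pair of strided accesses
  ///   x = a[2 * i]; y = a[2 * i + 1];
  /// forms an interleave group with factor 2 that can be widened into a single
  /// wide load followed by shuffles that de-interleave the two streams.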
1484   bool isAccessInterleaved(Instruction *Instr) {
1485     return InterleaveInfo.isInterleaved(Instr);
1486   }
1487 
1488   /// Get the interleaved access group that \p Instr belongs to.
1489   const InterleaveGroup<Instruction> *
1490   getInterleavedAccessGroup(Instruction *Instr) {
1491     return InterleaveInfo.getInterleaveGroup(Instr);
1492   }
1493 
1494   /// Returns true if we're required to use a scalar epilogue for at least
1495   /// the final iteration of the original loop.
1496   bool requiresScalarEpilogue(ElementCount VF) const {
1497     if (!isScalarEpilogueAllowed())
1498       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1501     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1502       return true;
1503     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1504   }
1505 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1508   bool isScalarEpilogueAllowed() const {
1509     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1510   }
1511 
  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
1513   bool foldTailByMasking() const { return FoldTailByMasking; }
1514 
  /// Returns true if we're tail-folding and want to use the active lane mask
  /// for vector loop control flow.
1517   bool useActiveLaneMaskForControlFlow() const {
1518     return FoldTailByMasking &&
1519            TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1520   }
1521 
  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
1525   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1526     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1527   }
1528 
1529   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1530   /// nodes to the chain of instructions representing the reductions. Uses a
1531   /// MapVector to ensure deterministic iteration order.
1532   using ReductionChainMap =
1533       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1534 
1535   /// Return the chain of instructions representing an inloop reduction.
1536   const ReductionChainMap &getInLoopReductionChains() const {
1537     return InLoopReductionChains;
1538   }
1539 
1540   /// Returns true if the Phi is part of an inloop reduction.
1541   bool isInLoopReduction(PHINode *Phi) const {
1542     return InLoopReductionChains.count(Phi);
1543   }
1544 
1545   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1546   /// with factor VF.  Return the cost of the instruction, including
1547   /// scalarization overhead if it's needed.
1548   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1549 
1550   /// Estimate cost of a call instruction CI if it were vectorized with factor
1551   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1555   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1556                                     bool &NeedToScalarize) const;
1557 
1558   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1559   /// that of B.
1560   bool isMoreProfitable(const VectorizationFactor &A,
1561                         const VectorizationFactor &B) const;
1562 
1563   /// Invalidates decisions already taken by the cost model.
1564   void invalidateCostModelingDecisions() {
1565     WideningDecisions.clear();
1566     Uniforms.clear();
1567     Scalars.clear();
1568   }
1569 
  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, and otherwise returns the value
  /// returned by the corresponding TTI method.
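  /// For example, with the function attribute vscale_range(2,2) this returns
  /// 2, whereas with vscale_range(1,16) the result comes from the TTI hook
  /// instead.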
1573   Optional<unsigned> getVScaleForTuning() const;
1574 
1575 private:
1576   unsigned NumPredStores = 0;
1577 
1578   /// \return An upper bound for the vectorization factors for both
1579   /// fixed and scalable vectorization, where the minimum-known number of
1580   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1581   /// disabled or unsupported, then the scalable part will be equal to
1582   /// ElementCount::getScalable(0).
1583   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1584                                            ElementCount UserVF,
1585                                            bool FoldTailByMasking);
1586 
  /// \return the maximized element count based on the target's vector
1588   /// registers and the loop trip-count, but limited to a maximum safe VF.
1589   /// This is a helper function of computeFeasibleMaxVF.
1590   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1591                                        unsigned SmallestType,
1592                                        unsigned WidestType,
1593                                        ElementCount MaxSafeVF,
1594                                        bool FoldTailByMasking);
1595 
1596   /// \return the maximum legal scalable VF, based on the safe max number
1597   /// of elements.
1598   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1599 
1600   /// The vectorization cost is a combination of the cost itself and a boolean
1601   /// indicating whether any of the contributing operations will actually
1602   /// operate on vector values after type legalization in the backend. If this
1603   /// latter value is false, then all operations will be scalarized (i.e. no
1604   /// vectorization has actually taken place).
1605   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1606 
1607   /// Returns the expected execution cost. The unit of the cost does
1608   /// not matter because we use the 'cost' units to compare different
1609   /// vector widths. The cost that is returned is *not* normalized by
1610   /// the factor width. If \p Invalid is not nullptr, this function
1611   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1612   /// each instruction that has an Invalid cost for the given VF.
1613   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1614   VectorizationCostTy
1615   expectedCost(ElementCount VF,
1616                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1617 
1618   /// Returns the execution time cost of an instruction for a given vector
1619   /// width. Vector width of one means scalar.
1620   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1621 
1622   /// The cost-computation logic from getInstructionCost which provides
1623   /// the vector type as an output parameter.
1624   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1625                                      Type *&VectorTy);
1626 
1627   /// Return the cost of instructions in an inloop reduction pattern, if I is
1628   /// part of that pattern.
1629   Optional<InstructionCost>
1630   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1631                           TTI::TargetCostKind CostKind);
1632 
1633   /// Calculate vectorization cost of memory instruction \p I.
1634   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1635 
1636   /// The cost computation for scalarized memory instruction.
1637   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1638 
1639   /// The cost computation for interleaving group of memory instructions.
1640   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1641 
1642   /// The cost computation for Gather/Scatter instruction.
1643   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1644 
1645   /// The cost computation for widening instruction \p I with consecutive
1646   /// memory access.
1647   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1648 
1649   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1650   /// Load: scalar load + broadcast.
1651   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1652   /// element)
1653   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1654 
1655   /// Estimate the overhead of scalarizing an instruction. This is a
1656   /// convenience wrapper for the type-based getScalarizationOverhead API.
1657   InstructionCost getScalarizationOverhead(Instruction *I,
1658                                            ElementCount VF) const;
1659 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1662   bool isConsecutiveLoadOrStore(Instruction *I);
1663 
1664   /// Returns true if an artificially high cost for emulated masked memrefs
1665   /// should be used.
1666   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1667 
1668   /// Map of scalar integer values to the smallest bitwidth they can be legally
1669   /// represented as. The vector equivalents of these values should be truncated
1670   /// to this type.
1671   MapVector<Instruction *, uint64_t> MinBWs;
1672 
1673   /// A type representing the costs for instructions if they were to be
1674   /// scalarized rather than vectorized. The entries are Instruction-Cost
1675   /// pairs.
1676   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1677 
  /// Per-VF sets containing all BasicBlocks that are known to be present
  /// after vectorization as predicated blocks.
1680   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1681       PredicatedBBsAfterVectorization;
1682 
1683   /// Records whether it is allowed to have the original scalar loop execute at
1684   /// least once. This may be needed as a fallback loop in case runtime
1685   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1687   /// or as a peel-loop to handle gaps in interleave-groups.
1688   /// Under optsize and when the trip count is very small we don't allow any
1689   /// iterations to execute in the scalar loop.
1690   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1691 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1693   bool FoldTailByMasking = false;
1694 
1695   /// A map holding scalar costs for different vectorization factors. The
1696   /// presence of a cost for an instruction in the mapping indicates that the
1697   /// instruction will be scalarized when vectorizing with the associated
1698   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1699   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1700 
1701   /// Holds the instructions known to be uniform after vectorization.
1702   /// The data is collected per VF.
1703   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1704 
1705   /// Holds the instructions known to be scalar after vectorization.
1706   /// The data is collected per VF.
1707   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1708 
1709   /// Holds the instructions (address computations) that are forced to be
1710   /// scalarized.
1711   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1712 
1713   /// PHINodes of the reductions that should be expanded in-loop along with
1714   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1716   ReductionChainMap InLoopReductionChains;
1717 
1718   /// A Map of inloop reduction operations and their immediate chain operand.
1719   /// FIXME: This can be removed once reductions can be costed correctly in
1720   /// vplan. This was added to allow quick lookup to the inloop operations,
1721   /// without having to loop through InLoopReductionChains.
1722   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1723 
1724   /// Returns the expected difference in cost from scalarizing the expression
1725   /// feeding a predicated instruction \p PredInst. The instructions to
1726   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1727   /// non-negative return value implies the expression will be scalarized.
1728   /// Currently, only single-use chains are considered for scalarization.
1729   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1730                               ElementCount VF);
1731 
1732   /// Collect the instructions that are uniform after vectorization. An
1733   /// instruction is uniform if we represent it with a single scalar value in
1734   /// the vectorized loop corresponding to each vector iteration. Examples of
1735   /// uniform instructions include pointer operands of consecutive or
1736   /// interleaved memory accesses. Note that although uniformity implies an
1737   /// instruction will be scalar, the reverse is not true. In general, a
1738   /// scalarized instruction will be represented by VF scalar values in the
1739   /// vectorized loop, each corresponding to an iteration of the original
1740   /// scalar loop.
1741   void collectLoopUniforms(ElementCount VF);
1742 
1743   /// Collect the instructions that are scalar after vectorization. An
1744   /// instruction is scalar if it is known to be uniform or will be scalarized
1745   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1746   /// to the list if they are used by a load/store instruction that is marked as
1747   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1748   /// VF values in the vectorized loop, each corresponding to an iteration of
1749   /// the original scalar loop.
1750   void collectLoopScalars(ElementCount VF);
1751 
1752   /// Keeps cost model vectorization decision and cost for instructions.
1753   /// Right now it is used for memory instructions only.
1754   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1755                                 std::pair<InstWidening, InstructionCost>>;
1756 
1757   DecisionList WideningDecisions;
1758 
1759   /// Returns true if \p V is expected to be vectorized and it needs to be
1760   /// extracted.
1761   bool needsExtract(Value *V, ElementCount VF) const {
1762     Instruction *I = dyn_cast<Instruction>(V);
1763     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1764         TheLoop->isLoopInvariant(I))
1765       return false;
1766 
1767     // Assume we can vectorize V (and hence we need extraction) if the
1768     // scalars are not computed yet. This can happen, because it is called
1769     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1770     // the scalars are collected. That should be a safe assumption in most
1771     // cases, because we check if the operands have vectorizable types
1772     // beforehand in LoopVectorizationLegality.
1773     return Scalars.find(VF) == Scalars.end() ||
1774            !isScalarAfterVectorization(I, VF);
1775   };
1776 
1777   /// Returns a range containing only operands needing to be extracted.
1778   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1779                                                    ElementCount VF) const {
1780     return SmallVector<Value *, 4>(make_filter_range(
1781         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1782   }
1783 
1784   /// Determines if we have the infrastructure to vectorize loop \p L and its
1785   /// epilogue, assuming the main loop is vectorized by \p VF.
1786   bool isCandidateForEpilogueVectorization(const Loop &L,
1787                                            const ElementCount VF) const;
1788 
1789   /// Returns true if epilogue vectorization is considered profitable, and
1790   /// false otherwise.
1791   /// \p VF is the vectorization factor chosen for the original loop.
1792   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1793 
1794 public:
1795   /// The loop that we evaluate.
1796   Loop *TheLoop;
1797 
1798   /// Predicated scalar evolution analysis.
1799   PredicatedScalarEvolution &PSE;
1800 
1801   /// Loop Info analysis.
1802   LoopInfo *LI;
1803 
1804   /// Vectorization legality.
1805   LoopVectorizationLegality *Legal;
1806 
1807   /// Vector target information.
1808   const TargetTransformInfo &TTI;
1809 
1810   /// Target Library Info.
1811   const TargetLibraryInfo *TLI;
1812 
1813   /// Demanded bits analysis.
1814   DemandedBits *DB;
1815 
1816   /// Assumption cache.
1817   AssumptionCache *AC;
1818 
1819   /// Interface to emit optimization remarks.
1820   OptimizationRemarkEmitter *ORE;
1821 
1822   const Function *TheFunction;
1823 
1824   /// Loop Vectorize Hint.
1825   const LoopVectorizeHints *Hints;
1826 
1827   /// The interleave access information contains groups of interleaved accesses
1828   /// with the same stride and close to each other.
1829   InterleavedAccessInfo &InterleaveInfo;
1830 
1831   /// Values to ignore in the cost model.
1832   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1833 
1834   /// Values to ignore in the cost model when VF > 1.
1835   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1836 
1837   /// All element types found in the loop.
1838   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1839 
1840   /// Profitable vector factors.
1841   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1842 };
1843 } // end namespace llvm
1844 
1845 /// Helper struct to manage generating runtime checks for vectorization.
1846 ///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimation of their cost, and are un-linked from the existing IR. After
/// deciding to vectorize, the checks are moved back. If deciding not to
/// vectorize, the temporary blocks are completely removed.
1851 class GeneratedRTChecks {
1852   /// Basic block which contains the generated SCEV checks, if any.
1853   BasicBlock *SCEVCheckBlock = nullptr;
1854 
1855   /// The value representing the result of the generated SCEV checks. If it is
1856   /// nullptr, either no SCEV checks have been generated or they have been used.
1857   Value *SCEVCheckCond = nullptr;
1858 
1859   /// Basic block which contains the generated memory runtime checks, if any.
1860   BasicBlock *MemCheckBlock = nullptr;
1861 
1862   /// The value representing the result of the generated memory runtime checks.
1863   /// If it is nullptr, either no memory runtime checks have been generated or
1864   /// they have been used.
1865   Value *MemRuntimeCheckCond = nullptr;
1866 
1867   DominatorTree *DT;
1868   LoopInfo *LI;
1869   TargetTransformInfo *TTI;
1870 
1871   SCEVExpander SCEVExp;
1872   SCEVExpander MemCheckExp;
1873 
1874   bool CostTooHigh = false;
1875 
1876 public:
1877   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1878                     TargetTransformInfo *TTI, const DataLayout &DL)
1879       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1880         MemCheckExp(SE, DL, "scev.check") {}
1881 
  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
1887   void Create(Loop *L, const LoopAccessInfo &LAI,
1888               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1889 
1890     // Hard cutoff to limit compile-time increase in case a very large number of
1891     // runtime checks needs to be generated.
1892     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1893     // profile info.
1894     CostTooHigh =
1895         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1896     if (CostTooHigh)
1897       return;
1898 
1899     BasicBlock *LoopHeader = L->getHeader();
1900     BasicBlock *Preheader = L->getLoopPreheader();
1901 
1902     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1903     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1904     // may be used by SCEVExpander. The blocks will be un-linked from their
1905     // predecessors and removed from LI & DT at the end of the function.
1906     if (!UnionPred.isAlwaysTrue()) {
1907       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1908                                   nullptr, "vector.scevcheck");
1909 
1910       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1911           &UnionPred, SCEVCheckBlock->getTerminator());
1912     }
1913 
1914     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1915     if (RtPtrChecking.Need) {
1916       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1917       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1918                                  "vector.memcheck");
1919 
1920       auto DiffChecks = RtPtrChecking.getDiffChecks();
1921       if (DiffChecks) {
1922         MemRuntimeCheckCond = addDiffRuntimeChecks(
1923             MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
1924             [VF](IRBuilderBase &B, unsigned Bits) {
1925               return getRuntimeVF(B, B.getIntNTy(Bits), VF);
1926             },
1927             IC);
1928       } else {
1929         MemRuntimeCheckCond =
1930             addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1931                              RtPtrChecking.getChecks(), MemCheckExp);
1932       }
1933       assert(MemRuntimeCheckCond &&
1934              "no RT checks generated although RtPtrChecking "
1935              "claimed checks are required");
1936     }
1937 
1938     if (!MemCheckBlock && !SCEVCheckBlock)
1939       return;
1940 
1941     // Unhook the temporary block with the checks, update various places
1942     // accordingly.
1943     if (SCEVCheckBlock)
1944       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1945     if (MemCheckBlock)
1946       MemCheckBlock->replaceAllUsesWith(Preheader);
1947 
1948     if (SCEVCheckBlock) {
1949       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1950       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1951       Preheader->getTerminator()->eraseFromParent();
1952     }
1953     if (MemCheckBlock) {
1954       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1955       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1956       Preheader->getTerminator()->eraseFromParent();
1957     }
1958 
1959     DT->changeImmediateDominator(LoopHeader, Preheader);
1960     if (MemCheckBlock) {
1961       DT->eraseNode(MemCheckBlock);
1962       LI->removeBlock(MemCheckBlock);
1963     }
1964     if (SCEVCheckBlock) {
1965       DT->eraseNode(SCEVCheckBlock);
1966       LI->removeBlock(SCEVCheckBlock);
1967     }
1968   }
1969 
1970   InstructionCost getCost() {
1971     if (SCEVCheckBlock || MemCheckBlock)
1972       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1973 
1974     if (CostTooHigh) {
1975       InstructionCost Cost;
1976       Cost.setInvalid();
1977       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1978       return Cost;
1979     }
1980 
1981     InstructionCost RTCheckCost = 0;
1982     if (SCEVCheckBlock)
1983       for (Instruction &I : *SCEVCheckBlock) {
1984         if (SCEVCheckBlock->getTerminator() == &I)
1985           continue;
1986         InstructionCost C =
1987             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1988         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1989         RTCheckCost += C;
1990       }
1991     if (MemCheckBlock)
1992       for (Instruction &I : *MemCheckBlock) {
1993         if (MemCheckBlock->getTerminator() == &I)
1994           continue;
1995         InstructionCost C =
1996             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1997         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1998         RTCheckCost += C;
1999       }
2000 
2001     if (SCEVCheckBlock || MemCheckBlock)
2002       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2003                         << "\n");
2004 
2005     return RTCheckCost;
2006   }
2007 
2008   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2009   /// unused.
2010   ~GeneratedRTChecks() {
2011     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2012     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2013     if (!SCEVCheckCond)
2014       SCEVCleaner.markResultUsed();
2015 
2016     if (!MemRuntimeCheckCond)
2017       MemCheckCleaner.markResultUsed();
2018 
2019     if (MemRuntimeCheckCond) {
2020       auto &SE = *MemCheckExp.getSE();
2021       // Memory runtime check generation creates compares that use expanded
2022       // values. Remove them before running the SCEVExpanderCleaners.
2023       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2024         if (MemCheckExp.isInsertedInstruction(&I))
2025           continue;
2026         SE.forgetValue(&I);
2027         I.eraseFromParent();
2028       }
2029     }
2030     MemCheckCleaner.cleanup();
2031     SCEVCleaner.cleanup();
2032 
2033     if (SCEVCheckCond)
2034       SCEVCheckBlock->eraseFromParent();
2035     if (MemRuntimeCheckCond)
2036       MemCheckBlock->eraseFromParent();
2037   }
2038 
2039   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2040   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2041   /// depending on the generated condition.
2042   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2043                              BasicBlock *LoopVectorPreHeader,
2044                              BasicBlock *LoopExitBlock) {
2045     if (!SCEVCheckCond)
2046       return nullptr;
2047 
2048     Value *Cond = SCEVCheckCond;
2049     // Mark the check as used, to prevent it from being removed during cleanup.
2050     SCEVCheckCond = nullptr;
2051     if (auto *C = dyn_cast<ConstantInt>(Cond))
2052       if (C->isZero())
2053         return nullptr;
2054 
2055     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2056 
2057     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // If the vector preheader is part of an enclosing loop, the new SCEV check
    // block becomes part of that loop as well.
2059     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2060       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2061 
2062     SCEVCheckBlock->getTerminator()->eraseFromParent();
2063     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2064     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2065                                                 SCEVCheckBlock);
2066 
2067     DT->addNewBlock(SCEVCheckBlock, Pred);
2068     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2069 
2070     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2071                         BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2072     return SCEVCheckBlock;
2073   }
2074 
2075   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2076   /// the branches to branch to the vector preheader or \p Bypass, depending on
2077   /// the generated condition.
2078   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2079                                    BasicBlock *LoopVectorPreHeader) {
2080     // Check if we generated code that checks in runtime if arrays overlap.
2081     if (!MemRuntimeCheckCond)
2082       return nullptr;
2083 
2084     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2085     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2086                                                 MemCheckBlock);
2087 
2088     DT->addNewBlock(MemCheckBlock, Pred);
2089     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2090     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2091 
2092     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2093       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2094 
2095     ReplaceInstWithInst(
2096         MemCheckBlock->getTerminator(),
2097         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2098     MemCheckBlock->getTerminator()->setDebugLoc(
2099         Pred->getTerminator()->getDebugLoc());
2100 
2101     // Mark the check as used, to prevent it from being removed during cleanup.
2102     MemRuntimeCheckCond = nullptr;
2103     return MemCheckBlock;
2104   }
2105 };
2106 
2107 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2108 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2114 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2115 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2116 // provides *explicit vectorization hints* (LV can bypass legal checks and
2117 // assume that vectorization is legal). However, both hints are implemented
2118 // using the same metadata (llvm.loop.vectorize, processed by
2119 // LoopVectorizeHints). This will be fixed in the future when the native IR
2120 // representation for pragma 'omp simd' is introduced.
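// For example (an illustrative sketch), an outer loop written as
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < n; ++i)
//     for (int j = 0; j < m; ++j)
//       a[i][j] += b[i][j];
// carries an explicit vector length of 4 and satisfies these requirements.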
2121 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2122                                    OptimizationRemarkEmitter *ORE) {
2123   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2124   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2125 
2126   // Only outer loops with an explicit vectorization hint are supported.
2127   // Unannotated outer loops are ignored.
2128   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2129     return false;
2130 
2131   Function *Fn = OuterLp->getHeader()->getParent();
2132   if (!Hints.allowVectorization(Fn, OuterLp,
2133                                 true /*VectorizeOnlyWhenForced*/)) {
2134     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2135     return false;
2136   }
2137 
2138   if (Hints.getInterleave() > 1) {
2139     // TODO: Interleave support is future work.
2140     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2141                          "outer loops.\n");
2142     Hints.emitRemarkWithHints();
2143     return false;
2144   }
2145 
2146   return true;
2147 }
2148 
2149 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2150                                   OptimizationRemarkEmitter *ORE,
2151                                   SmallVectorImpl<Loop *> &V) {
2152   // Collect inner loops and outer loops without irreducible control flow. For
2153   // now, only collect outer loops that have explicit vectorization hints. If we
2154   // are stress testing the VPlan H-CFG construction, we collect the outermost
2155   // loop of every loop nest.
2156   if (L.isInnermost() || VPlanBuildStressTest ||
2157       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2158     LoopBlocksRPO RPOT(&L);
2159     RPOT.perform(LI);
2160     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2161       V.push_back(&L);
2162       // TODO: Collect inner loops inside marked outer loops in case
2163       // vectorization fails for the outer loop. Do not invoke
2164       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2165       // already known to be reducible. We can use an inherited attribute for
2166       // that.
2167       return;
2168     }
2169   }
2170   for (Loop *InnerL : L)
2171     collectSupportedLoops(*InnerL, LI, ORE, V);
2172 }
2173 
2174 namespace {
2175 
2176 /// The LoopVectorize Pass.
2177 struct LoopVectorize : public FunctionPass {
2178   /// Pass identification, replacement for typeid
2179   static char ID;
2180 
2181   LoopVectorizePass Impl;
2182 
2183   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2184                          bool VectorizeOnlyWhenForced = false)
2185       : FunctionPass(ID),
2186         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2187     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2188   }
2189 
2190   bool runOnFunction(Function &F) override {
2191     if (skipFunction(F))
2192       return false;
2193 
2194     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2195     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2196     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2197     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2198     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2199     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2200     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2201     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2202     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2203     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2204     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2205     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2206     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2207 
2208     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2209         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2210 
2211     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2212                         GetLAA, *ORE, PSI).MadeAnyChange;
2213   }
2214 
2215   void getAnalysisUsage(AnalysisUsage &AU) const override {
2216     AU.addRequired<AssumptionCacheTracker>();
2217     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2218     AU.addRequired<DominatorTreeWrapperPass>();
2219     AU.addRequired<LoopInfoWrapperPass>();
2220     AU.addRequired<ScalarEvolutionWrapperPass>();
2221     AU.addRequired<TargetTransformInfoWrapperPass>();
2222     AU.addRequired<AAResultsWrapperPass>();
2223     AU.addRequired<LoopAccessLegacyAnalysis>();
2224     AU.addRequired<DemandedBitsWrapperPass>();
2225     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2226     AU.addRequired<InjectTLIMappingsLegacy>();
2227 
    // We currently do not preserve the LoopInfo/DominatorTree analyses with
    // outer loop vectorization. Until this is addressed, mark these analyses
    // as preserved only for the non-VPlan-native path.
2231     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2232     if (!EnableVPlanNativePath) {
2233       AU.addPreserved<LoopInfoWrapperPass>();
2234       AU.addPreserved<DominatorTreeWrapperPass>();
2235     }
2236 
2237     AU.addPreserved<BasicAAWrapperPass>();
2238     AU.addPreserved<GlobalsAAWrapperPass>();
2239     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2240   }
2241 };
2242 
2243 } // end anonymous namespace
2244 
2245 //===----------------------------------------------------------------------===//
2246 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2247 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2248 //===----------------------------------------------------------------------===//
2249 
2250 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2254   Instruction *Instr = dyn_cast<Instruction>(V);
2255   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2256                      (!Instr ||
2257                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2258   // Place the code for broadcasting invariant variables in the new preheader.
2259   IRBuilder<>::InsertPointGuard Guard(Builder);
2260   if (SafeToHoist)
2261     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2262 
2263   // Broadcast the scalar into all locations in the vector.
2264   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2265 
2266   return Shuf;
2267 }
2268 
2269 /// This function adds
2270 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant for FP induction variables.
2273 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2274                             Instruction::BinaryOps BinOp, ElementCount VF,
2275                             IRBuilderBase &Builder) {
2276   assert(VF.isVector() && "only vector VFs are supported");
2277 
2278   // Create and check the types.
2279   auto *ValVTy = cast<VectorType>(Val->getType());
2280   ElementCount VLen = ValVTy->getElementCount();
2281 
2282   Type *STy = Val->getType()->getScalarType();
2283   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2284          "Induction Step must be an integer or FP");
2285   assert(Step->getType() == STy && "Step has wrong type");
2286 
2287   SmallVector<Constant *, 8> Indices;
2288 
2289   // Create a vector of consecutive numbers from zero to VF.
2290   VectorType *InitVecValVTy = ValVTy;
2291   if (STy->isFloatingPointTy()) {
2292     Type *InitVecValSTy =
2293         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2294     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2295   }
2296   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2297 
2298   // Splat the StartIdx
2299   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2300 
2301   if (STy->isIntegerTy()) {
2302     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2303     Step = Builder.CreateVectorSplat(VLen, Step);
2304     assert(Step->getType() == Val->getType() && "Invalid step vec");
2305     // FIXME: The newly created binary instructions should contain nsw/nuw
2306     // flags, which can be found from the original scalar operations.
2307     Step = Builder.CreateMul(InitVec, Step);
2308     return Builder.CreateAdd(Val, Step, "induction");
2309   }
2310 
2311   // Floating point induction.
2312   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2313          "Binary Opcode should be specified for FP induction");
2314   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2315   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2316 
2317   Step = Builder.CreateVectorSplat(VLen, Step);
2318   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2319   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2320 }
2321 
2322 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, and \p Step is the size of the step.
2324 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2325                              const InductionDescriptor &ID, VPValue *Def,
2326                              VPTransformState &State) {
2327   IRBuilderBase &Builder = State.Builder;
2328   // We shouldn't have to build scalar steps if we aren't vectorizing.
2329   assert(State.VF.isVector() && "VF should be greater than one");
2330   // Get the value type and ensure it and the step have the same integer type.
2331   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2332   assert(ScalarIVTy == Step->getType() &&
2333          "Val and Step should have the same type");
2334 
2335   // We build scalar steps for both integer and floating-point induction
2336   // variables. Here, we determine the kind of arithmetic we will perform.
2337   Instruction::BinaryOps AddOp;
2338   Instruction::BinaryOps MulOp;
2339   if (ScalarIVTy->isIntegerTy()) {
2340     AddOp = Instruction::Add;
2341     MulOp = Instruction::Mul;
2342   } else {
2343     AddOp = ID.getInductionOpcode();
2344     MulOp = Instruction::FMul;
2345   }
2346 
2347   // Determine the number of scalars we need to generate for each unroll
2348   // iteration.
2349   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2350   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2351   // Compute the scalar steps and save the results in State.
2352   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2353                                      ScalarIVTy->getScalarSizeInBits());
2354   Type *VecIVTy = nullptr;
2355   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2356   if (!FirstLaneOnly && State.VF.isScalable()) {
2357     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2358     UnitStepVec =
2359         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2360     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2361     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2362   }
2363 
2364   for (unsigned Part = 0; Part < State.UF; ++Part) {
2365     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2366 
2367     if (!FirstLaneOnly && State.VF.isScalable()) {
2368       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2369       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2370       if (ScalarIVTy->isFloatingPointTy())
2371         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2372       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2373       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2374       State.set(Def, Add, Part);
      // It's also useful to record the per-lane values for the known minimum
      // number of elements, so we do that below as well. This improves the
      // code quality when extracting the first element, for example.
2378     }
2379 
2380     if (ScalarIVTy->isFloatingPointTy())
2381       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2382 
2383     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2384       Value *StartIdx = Builder.CreateBinOp(
2385           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2386       // The step returned by `createStepForVF` is a runtime-evaluated value
2387       // when VF is scalable. Otherwise, it should be folded into a Constant.
2388       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2389              "Expected StartIdx to be folded to a constant when VF is not "
2390              "scalable");
2391       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2392       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2393       State.set(Def, Add, VPIteration(Part, Lane));
2394     }
2395   }
2396 }
2397 
2398 // Generate code for the induction step. Note that induction steps are
2399 // required to be loop-invariant.
2400 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2401                               Instruction *InsertBefore,
2402                               Loop *OrigLoop = nullptr) {
2403   const DataLayout &DL = SE.getDataLayout();
2404   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2405          "Induction step should be loop invariant");
2406   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2407     return E->getValue();
2408 
2409   SCEVExpander Exp(SE, DL, "induction");
2410   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2411 }
2412 
2413 /// Compute the transformed value of Index at offset StartValue using step
2414 /// StepValue.
2415 /// For integer induction, returns StartValue + Index * StepValue.
2416 /// For pointer induction, returns StartValue[Index * StepValue].
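/// For example (illustrative): with StartValue = %start, Index = %i and
/// StepValue = 4, an integer induction produces %start + %i * 4, while a
/// pointer induction produces a GEP indexing %i * 4 elements from %start.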
2417 /// FIXME: The newly created binary instructions should contain nsw/nuw
2418 /// flags, which can be found from the original scalar operations.
2419 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2420                                    Value *StartValue, Value *Step,
2421                                    const InductionDescriptor &ID) {
2422   assert(Index->getType()->getScalarType() == Step->getType() &&
2423          "Index scalar type does not match StepValue type");
2424 
2425   // Note: the IR at this point is broken. We cannot use SE to create any new
2426   // SCEV and then expand it, hoping that SCEV's simplification will give us
2427   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2428   // lead to various SCEV crashes. So all we can do is use the builder and rely
2429   // on InstCombine for future simplifications. Here we handle some trivial
2430   // cases only.
2431   auto CreateAdd = [&B](Value *X, Value *Y) {
2432     assert(X->getType() == Y->getType() && "Types don't match!");
2433     if (auto *CX = dyn_cast<ConstantInt>(X))
2434       if (CX->isZero())
2435         return Y;
2436     if (auto *CY = dyn_cast<ConstantInt>(Y))
2437       if (CY->isZero())
2438         return X;
2439     return B.CreateAdd(X, Y);
2440   };
2441 
2442   // We allow X to be a vector type, in which case Y will potentially be
2443   // splatted into a vector with the same element count.
2444   auto CreateMul = [&B](Value *X, Value *Y) {
2445     assert(X->getType()->getScalarType() == Y->getType() &&
2446            "Types don't match!");
2447     if (auto *CX = dyn_cast<ConstantInt>(X))
2448       if (CX->isOne())
2449         return Y;
2450     if (auto *CY = dyn_cast<ConstantInt>(Y))
2451       if (CY->isOne())
2452         return X;
2453     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2454     if (XVTy && !isa<VectorType>(Y->getType()))
2455       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2456     return B.CreateMul(X, Y);
2457   };
2458 
2459   switch (ID.getKind()) {
2460   case InductionDescriptor::IK_IntInduction: {
2461     assert(!isa<VectorType>(Index->getType()) &&
2462            "Vector indices not supported for integer inductions yet");
2463     assert(Index->getType() == StartValue->getType() &&
2464            "Index type does not match StartValue type");
2465     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2466       return B.CreateSub(StartValue, Index);
2467     auto *Offset = CreateMul(Index, Step);
2468     return CreateAdd(StartValue, Offset);
2469   }
2470   case InductionDescriptor::IK_PtrInduction: {
2471     assert(isa<Constant>(Step) &&
2472            "Expected constant step for pointer induction");
2473     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2474   }
2475   case InductionDescriptor::IK_FpInduction: {
2476     assert(!isa<VectorType>(Index->getType()) &&
2477            "Vector indices not supported for FP inductions yet");
2478     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2479     auto InductionBinOp = ID.getInductionBinOp();
2480     assert(InductionBinOp &&
2481            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2482             InductionBinOp->getOpcode() == Instruction::FSub) &&
2483            "Original bin op should be defined for FP induction");
2484 
2485     Value *MulExp = B.CreateFMul(Step, Index);
2486     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2487                          "induction");
2488   }
2489   case InductionDescriptor::IK_NoInduction:
2490     return nullptr;
2491   }
2492   llvm_unreachable("invalid enum");
2493 }
2494 
2495 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2496                                                     const VPIteration &Instance,
2497                                                     VPTransformState &State) {
2498   Value *ScalarInst = State.get(Def, Instance);
2499   Value *VectorValue = State.get(Def, Instance.Part);
2500   VectorValue = Builder.CreateInsertElement(
2501       VectorValue, ScalarInst,
2502       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2503   State.set(Def, VectorValue, Instance.Part);
2504 }
2505 
2506 // Return whether we allow using masked interleave-groups (for dealing with
2507 // strided loads/stores that reside in predicated blocks, or for dealing
2508 // with gaps).
2509 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2510   // If an override option has been passed in for interleaved accesses, use it.
2511   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2512     return EnableMaskedInterleavedMemAccesses;
2513 
2514   return TTI.enableMaskedInterleavedAccessVectorization();
2515 }
2516 
2517 // Try to vectorize the interleave group that \p Instr belongs to.
2518 //
2519 // E.g. Translate the following interleaved load group (factor = 3):
2520 //   for (i = 0; i < N; i+=3) {
2521 //     R = Pic[i];             // Member of index 0
2522 //     G = Pic[i+1];           // Member of index 1
2523 //     B = Pic[i+2];           // Member of index 2
2524 //     ... // do something to R, G, B
2525 //   }
2526 // To:
2527 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2528 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2529 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2530 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2531 //
2532 // Or translate the following interleaved store group (factor = 3):
2533 //   for (i = 0; i < N; i+=3) {
2534 //     ... do something to R, G, B
2535 //     Pic[i]   = R;           // Member of index 0
2536 //     Pic[i+1] = G;           // Member of index 1
2537 //     Pic[i+2] = B;           // Member of index 2
2538 //   }
2539 // To:
2540 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2541 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2542 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2543 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2544 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
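// When the group sits in a predicated block or has gaps, the translation is
// the same except that the block mask is first replicated per member and the
// memory access becomes a masked intrinsic, roughly (illustrative):
//   %interleaved.mask = shuffle %block.mask, poison, <0,0,0, 1,1,1, 2,2,2, 3,3,3>
//   %wide.masked.vec  = masked load of <12 x i32> guarded by %interleaved.mask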
2545 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2546     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2547     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2548     VPValue *BlockInMask) {
2549   Instruction *Instr = Group->getInsertPos();
2550   const DataLayout &DL = Instr->getModule()->getDataLayout();
2551 
2552   // Prepare for the vector type of the interleaved load/store.
2553   Type *ScalarTy = getLoadStoreType(Instr);
2554   unsigned InterleaveFactor = Group->getFactor();
2555   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2556   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2557 
2558   // Prepare for the new pointers.
2559   SmallVector<Value *, 2> AddrParts;
2560   unsigned Index = Group->getIndex(Instr);
2561 
2562   // TODO: extend the masked interleaved-group support to reversed access.
2563   assert((!BlockInMask || !Group->isReverse()) &&
2564          "Reversed masked interleave-group not supported.");
2565 
2566   // If the group is reverse, adjust the index to refer to the last vector lane
2567   // instead of the first. We adjust the index from the first vector lane,
2568   // rather than directly getting the pointer for lane VF - 1, because the
2569   // pointer operand of the interleaved access is supposed to be uniform. For
2570   // uniform instructions, we're only required to generate a value for the
2571   // first vector lane in each unroll iteration.
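  // For example (illustrative): with VF = 4 and an interleave factor of 3,
  // Index grows by (4 - 1) * 3 = 9 extra elements, so the uniform pointer
  // computed below ends up at the lowest address covered by the wide access.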
2572   if (Group->isReverse())
2573     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2574 
2575   for (unsigned Part = 0; Part < UF; Part++) {
2576     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2577     State.setDebugLocFromInst(AddrPart);
2578 
2579     // Note that the current instruction could be at any index. We need to
2580     // adjust the address to that of the member at index 0.
2581     //
2582     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2583     //       b = A[i];       // Member of index 0
2584     // The current pointer points to A[i+1]; adjust it to A[i].
2585     //
2586     // E.g.  A[i+1] = a;     // Member of index 1
2587     //       A[i]   = b;     // Member of index 0
2588     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2589     // The current pointer points to A[i+2]; adjust it to A[i].
2590 
2591     bool InBounds = false;
2592     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2593       InBounds = gep->isInBounds();
2594     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2595     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2596 
2597     // Cast to the vector pointer type.
2598     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2599     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2600     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2601   }
2602 
2603   State.setDebugLocFromInst(Instr);
2604   Value *PoisonVec = PoisonValue::get(VecTy);
2605 
2606   Value *MaskForGaps = nullptr;
2607   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2608     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2609     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2610   }
2611 
2612   // Vectorize the interleaved load group.
2613   if (isa<LoadInst>(Instr)) {
2614     // For each unroll part, create a wide load for the group.
2615     SmallVector<Value *, 2> NewLoads;
2616     for (unsigned Part = 0; Part < UF; Part++) {
2617       Instruction *NewLoad;
2618       if (BlockInMask || MaskForGaps) {
2619         assert(useMaskedInterleavedAccesses(*TTI) &&
2620                "masked interleaved groups are not allowed.");
2621         Value *GroupMask = MaskForGaps;
2622         if (BlockInMask) {
2623           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2624           Value *ShuffledMask = Builder.CreateShuffleVector(
2625               BlockInMaskPart,
2626               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2627               "interleaved.mask");
2628           GroupMask = MaskForGaps
2629                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2630                                                 MaskForGaps)
2631                           : ShuffledMask;
2632         }
2633         NewLoad =
2634             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2635                                      GroupMask, PoisonVec, "wide.masked.vec");
2636       } else
2638         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2639                                             Group->getAlign(), "wide.vec");
2640       Group->addMetadata(NewLoad);
2641       NewLoads.push_back(NewLoad);
2642     }
2643 
2644     // For each member in the group, shuffle out the appropriate data from the
2645     // wide loads.
2646     unsigned J = 0;
2647     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2648       Instruction *Member = Group->getMember(I);
2649 
2650       // Skip the gaps in the group.
2651       if (!Member)
2652         continue;
2653 
2654       auto StrideMask =
2655           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2656       for (unsigned Part = 0; Part < UF; Part++) {
2657         Value *StridedVec = Builder.CreateShuffleVector(
2658             NewLoads[Part], StrideMask, "strided.vec");
2659 
2660         // If this member has a different type, cast the result type.
2661         if (Member->getType() != ScalarTy) {
2662           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2663           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2664           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2665         }
2666 
2667         if (Group->isReverse())
2668           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2669 
2670         State.set(VPDefs[J], StridedVec, Part);
2671       }
2672       ++J;
2673     }
2674     return;
2675   }
2676 
2677   // The sub vector type for the current instruction.
2678   auto *SubVT = VectorType::get(ScalarTy, VF);
2679 
2680   // Vectorize the interleaved store group.
2681   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2682   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2683          "masked interleaved groups are not allowed.");
2684   assert((!MaskForGaps || !VF.isScalable()) &&
2685          "masking gaps for scalable vectors is not yet supported.");
2686   for (unsigned Part = 0; Part < UF; Part++) {
2687     // Collect the stored vector from each member.
2688     SmallVector<Value *, 4> StoredVecs;
2689     for (unsigned i = 0; i < InterleaveFactor; i++) {
2690       assert((Group->getMember(i) || MaskForGaps) &&
2691              "Failed to get a member from an interleaved store group");
2692       Instruction *Member = Group->getMember(i);
2693 
2694       // Skip the gaps in the group.
2695       if (!Member) {
2696         Value *Undef = PoisonValue::get(SubVT);
2697         StoredVecs.push_back(Undef);
2698         continue;
2699       }
2700 
2701       Value *StoredVec = State.get(StoredValues[i], Part);
2702 
2703       if (Group->isReverse())
2704         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2705 
2706       // If this member has a different type, cast it to a unified type.
2708       if (StoredVec->getType() != SubVT)
2709         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2710 
2711       StoredVecs.push_back(StoredVec);
2712     }
2713 
2714     // Concatenate all vectors into a wide vector.
2715     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2716 
2717     // Interleave the elements in the wide vector.
2718     Value *IVec = Builder.CreateShuffleVector(
2719         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2720         "interleaved.vec");
2721 
2722     Instruction *NewStoreInstr;
2723     if (BlockInMask || MaskForGaps) {
2724       Value *GroupMask = MaskForGaps;
2725       if (BlockInMask) {
2726         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2727         Value *ShuffledMask = Builder.CreateShuffleVector(
2728             BlockInMaskPart,
2729             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2730             "interleaved.mask");
2731         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2732                                                       ShuffledMask, MaskForGaps)
2733                                 : ShuffledMask;
2734       }
2735       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2736                                                 Group->getAlign(), GroupMask);
2737     } else
2738       NewStoreInstr =
2739           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2740 
2741     Group->addMetadata(NewStoreInstr);
2742   }
2743 }
2744 
2745 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2746                                                VPReplicateRecipe *RepRecipe,
2747                                                const VPIteration &Instance,
2748                                                bool IfPredicateInstr,
2749                                                VPTransformState &State) {
2750   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2751 
2752   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2753   // the first lane and part.
2754   if (isa<NoAliasScopeDeclInst>(Instr))
2755     if (!Instance.isFirstIteration())
2756       return;
2757 
2758   // Does this instruction return a value?
2759   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2760 
2761   Instruction *Cloned = Instr->clone();
2762   if (!IsVoidRetTy)
2763     Cloned->setName(Instr->getName() + ".cloned");
2764 
2765   // If the scalarized instruction contributes to the address computation of a
2766   // widened masked load/store which was in a basic block that needed predication
2767   // and is not predicated after vectorization, we can't propagate
2768   // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2769   // instruction could feed a poison value to the base address of the widened
2770   // load/store.
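  // For example (illustrative): an 'inbounds' GEP whose result only fed a
  // predicated access in the original loop may now feed a wide, unpredicated
  // access; a lane that never executed originally could make the GEP poison,
  // so the poison-generating flags are dropped.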
2771   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2772     Cloned->dropPoisonGeneratingFlags();
2773 
2774   if (Instr->getDebugLoc())
2775     State.setDebugLocFromInst(Instr);
2776 
2777   // Replace the operands of the cloned instruction with their scalar
2778   // equivalents in the new loop.
2779   for (auto &I : enumerate(RepRecipe->operands())) {
2780     auto InputInstance = Instance;
2781     VPValue *Operand = I.value();
2782     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2783     if (OperandR && OperandR->isUniform())
2784       InputInstance.Lane = VPLane::getFirstLane();
2785     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2786   }
2787   State.addNewMetadata(Cloned, Instr);
2788 
2789   // Place the cloned scalar in the new loop.
2790   State.Builder.Insert(Cloned);
2791 
2792   State.set(RepRecipe, Cloned, Instance);
2793 
2794   // If we just cloned a new assumption, add it to the assumption cache.
2795   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2796     AC->registerAssumption(II);
2797 
2798   // End if-block.
2799   if (IfPredicateInstr)
2800     PredicatedInstructions.push_back(Cloned);
2801 }
2802 
2803 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2804   if (TripCount)
2805     return TripCount;
2806 
2807   assert(InsertBlock);
2808   IRBuilder<> Builder(InsertBlock->getTerminator());
2809   // Find the loop boundaries.
2810   ScalarEvolution *SE = PSE.getSE();
2811   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2812   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2813          "Invalid loop count");
2814 
2815   Type *IdxTy = Legal->getWidestInductionType();
2816   assert(IdxTy && "No type for induction");
2817 
2818   // The exit count might have type i64 while the phi is i32. This can happen
2819   // if we have an induction variable that is sign extended before the
2820   // compare. The only way we could get a backedge-taken count here is if the
2821   // induction variable was signed and, as such, does not overflow. In such a
2822   // case truncation is legal.
2823   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2824       IdxTy->getPrimitiveSizeInBits())
2825     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2826   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2827 
2828   // Get the total trip count from the count by adding 1.
2829   const SCEV *ExitCount = SE->getAddExpr(
2830       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2831 
2832   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2833 
2834   // Expand the trip count and place the new instructions in the preheader.
2835   // Notice that the pre-header does not change, only the loop body.
2836   SCEVExpander Exp(*SE, DL, "induction");
2837 
2838   // Count holds the overall loop count (N).
2839   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2840                                 InsertBlock->getTerminator());
2841 
2842   if (TripCount->getType()->isPointerTy())
2843     TripCount =
2844         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2845                                     InsertBlock->getTerminator());
2846 
2847   return TripCount;
2848 }
2849 
2850 Value *
2851 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2852   if (VectorTripCount)
2853     return VectorTripCount;
2854 
2855   Value *TC = getOrCreateTripCount(InsertBlock);
2856   IRBuilder<> Builder(InsertBlock->getTerminator());
2857 
2858   Type *Ty = TC->getType();
2859   // This is where we can make the step a runtime constant.
2860   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2861 
2862   // If the tail is to be folded by masking, round the number of iterations N
2863   // up to a multiple of Step instead of rounding down. This is done by first
2864   // adding Step-1 and then rounding down. Note that it's ok if this addition
2865   // overflows: the vector induction variable will eventually wrap to zero given
2866   // that it starts at zero and its Step is a power of two; the loop will then
2867   // exit, with the last early-exit vector comparison also producing all-true.
2868   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2869   // is accounted for in emitIterationCountCheck that adds an overflow check.
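  // For illustration: with VF * UF = 8 and N = 10, the rounded count is
  // 10 + 7 = 17, which the computation below turns into a vector trip count of
  // 16; the final, partially-full vector iteration is handled by the mask.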
2870   if (Cost->foldTailByMasking()) {
2871     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2872            "VF*UF must be a power of 2 when folding tail by masking");
2873     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2874     TC = Builder.CreateAdd(
2875         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2876   }
2877 
2878   // Now we need to generate the expression for the part of the loop that the
2879   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2880   // iterations are not required for correctness, or N - Step, otherwise. Step
2881   // is equal to the vectorization factor (number of SIMD elements) times the
2882   // unroll factor (number of SIMD instructions).
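  // For illustration (no tail folding): with VF * UF = 8 and N = 10,
  // N % Step = 2, so the vector loop covers 8 iterations and the scalar
  // remainder loop executes the final 2.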
2883   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2884 
2885   // There are cases where we *must* run at least one iteration in the remainder
2886   // loop.  See the cost model for when this can happen.  If the step evenly
2887   // divides the trip count, we set the remainder to be equal to the step. If
2888   // the step does not evenly divide the trip count, no adjustment is necessary
2889   // since there will already be scalar iterations. Note that the minimum
2890   // iterations check ensures that N >= Step.
2891   if (Cost->requiresScalarEpilogue(VF)) {
2892     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2893     R = Builder.CreateSelect(IsZero, Step, R);
2894   }
2895 
2896   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2897 
2898   return VectorTripCount;
2899 }
2900 
2901 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2902                                                    const DataLayout &DL) {
2903   // Verify that V is a vector type with the same number of elements as DstVTy.
2904   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2905   unsigned VF = DstFVTy->getNumElements();
2906   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2907   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2908   Type *SrcElemTy = SrcVecTy->getElementType();
2909   Type *DstElemTy = DstFVTy->getElementType();
2910   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2911          "Vector elements must have same size");
2912 
2913   // Do a direct cast if element types are castable.
2914   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2915     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2916   }
2917   // V cannot be directly cast to the desired vector type.
2918   // This may happen when V is a floating point vector but DstVTy is a vector of
2919   // pointers or vice-versa. Handle this with a two-step cast using an
2920   // intermediate integer type, i.e., Ptr <-> Int <-> Float.
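  // For example (illustrative), on a target with 64-bit pointers a
  // <4 x double> value is first bitcast to <4 x i64> and then converted to the
  // destination pointer vector type (or vice versa).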
2921   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2922          "Only one type should be a pointer type");
2923   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2924          "Only one type should be a floating point type");
2925   Type *IntTy =
2926       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2927   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2928   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2929   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2930 }
2931 
2932 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2933   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2934   // Reuse existing vector loop preheader for TC checks.
2935   // Note that a new preheader block is generated for the vector loop.
2936   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2937   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2938 
2939   // Generate code to check if the loop's trip count is less than VF * UF, or
2940   // equal to it in case a scalar epilogue is required; this implies that the
2941   // vector trip count is zero. This check also covers the case where adding one
2942   // to the backedge-taken count overflowed, leading to an incorrect trip count
2943   // of zero. In this case we will also jump to the scalar loop.
2944   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2945                                             : ICmpInst::ICMP_ULT;
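  // For example (illustrative): with a step of VF * UF = 8, requiring a scalar
  // epilogue means we bypass the vector loop when the trip count is <= 8,
  // since at least one scalar iteration must remain; otherwise we bypass only
  // when it is < 8.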
2946 
2947   // If tail is to be folded, vector loop takes care of all iterations.
2948   Type *CountTy = Count->getType();
2949   Value *CheckMinIters = Builder.getFalse();
2950   auto CreateStep = [&]() {
2951     // Create step with max(MinProfitableTripCount, UF * VF).
2952     if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue())
2953       return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2954     return createStepForVF(Builder, CountTy, VF, UF);
2955   };
2956 
2957   if (!Cost->foldTailByMasking())
2958     CheckMinIters =
2959         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2960   else if (VF.isScalable()) {
2961     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2962     // an overflow to zero when updating induction variables and so an
2963     // additional overflow check is required before entering the vector loop.
2964 
2965     // Get the maximum unsigned value for the type.
2966     Value *MaxUIntTripCount =
2967         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2968     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2969 
2970     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2971     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2972   }
2973 
2974   // Create new preheader for vector loop.
2975   LoopVectorPreHeader =
2976       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2977                  "vector.ph");
2978 
2979   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2980                                DT->getNode(Bypass)->getIDom()) &&
2981          "TC check is expected to dominate Bypass");
2982 
2983   // Update dominator for Bypass & LoopExit (if needed).
2984   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2985   if (!Cost->requiresScalarEpilogue(VF))
2986     // If there is an epilogue which must run, there's no edge from the
2987     // middle block to the exit blocks and thus no need to update the immediate
2988     // dominator of the exit blocks.
2989     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2990 
2991   ReplaceInstWithInst(
2992       TCCheckBlock->getTerminator(),
2993       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2994   LoopBypassBlocks.push_back(TCCheckBlock);
2995 }
2996 
2997 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2998   BasicBlock *const SCEVCheckBlock =
2999       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3000   if (!SCEVCheckBlock)
3001     return nullptr;
3002 
3003   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3004            (OptForSizeBasedOnProfile &&
3005             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3006          "Cannot SCEV check stride or overflow when optimizing for size");
3007 
3009   // Update dominator only if this is the first RT check.
3010   if (LoopBypassBlocks.empty()) {
3011     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3012     if (!Cost->requiresScalarEpilogue(VF))
3013       // If there is an epilogue which must run, there's no edge from the
3014       // middle block to the exit blocks and thus no need to update the immediate
3015       // dominator of the exit blocks.
3016       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3017   }
3018 
3019   LoopBypassBlocks.push_back(SCEVCheckBlock);
3020   AddedSafetyChecks = true;
3021   return SCEVCheckBlock;
3022 }
3023 
3024 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3025   // VPlan-native path does not do any analysis for runtime checks currently.
3026   if (EnableVPlanNativePath)
3027     return nullptr;
3028 
3029   BasicBlock *const MemCheckBlock =
3030       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3031 
3032   // Check if we generated code that checks at runtime if arrays overlap. We put
3033   // the checks into a separate block to make the more common case of few
3034   // elements faster.
3035   if (!MemCheckBlock)
3036     return nullptr;
3037 
3038   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3039     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3040            "Cannot emit memory checks when optimizing for size, unless forced "
3041            "to vectorize.");
3042     ORE->emit([&]() {
3043       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3044                                         OrigLoop->getStartLoc(),
3045                                         OrigLoop->getHeader())
3046              << "Code-size may be reduced by not forcing "
3047                 "vectorization, or by source-code modifications "
3048                 "eliminating the need for runtime checks "
3049                 "(e.g., adding 'restrict').";
3050     });
3051   }
3052 
3053   LoopBypassBlocks.push_back(MemCheckBlock);
3054 
3055   AddedSafetyChecks = true;
3056 
3057   return MemCheckBlock;
3058 }
3059 
3060 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3061   LoopScalarBody = OrigLoop->getHeader();
3062   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3063   assert(LoopVectorPreHeader && "Invalid loop structure");
3064   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3065   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3066          "multiple exit loop without required epilogue?");
3067 
3068   LoopMiddleBlock =
3069       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3070                  LI, nullptr, Twine(Prefix) + "middle.block");
3071   LoopScalarPreHeader =
3072       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3073                  nullptr, Twine(Prefix) + "scalar.ph");
3074 
3075   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3076 
3077   // Set up the middle block terminator.  Two cases:
3078   // 1) If we know that we must execute the scalar epilogue, emit an
3079   //    unconditional branch.
3080   // 2) Otherwise, we must have a single unique exit block (due to how we
3081   //    implement the multiple exit case).  In this case, set up a conditional
3082   //    branch from the middle block to the loop scalar preheader, and the
3083   //    exit block.  completeLoopSkeleton will update the condition to use an
3084   //    iteration check, if required, to decide whether to execute the remainder.
3085   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3086     BranchInst::Create(LoopScalarPreHeader) :
3087     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3088                        Builder.getTrue());
3089   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3090   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3091 
3092   // Update dominator for loop exit. During skeleton creation, only the vector
3093   // pre-header and the middle block are created. The vector loop is entirely
3094   // created during VPlan execution.
3095   if (!Cost->requiresScalarEpilogue(VF))
3096     // If there is an epilogue which must run, there's no edge from the
3097     // middle block to the exit blocks and thus no need to update the immediate
3098     // dominator of the exit blocks.
3099     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3100 }
3101 
3102 void InnerLoopVectorizer::createInductionResumeValues(
3103     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3104   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3105           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3106          "Inconsistent information about additional bypass.");
3107 
3108   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3109   assert(VectorTripCount && "Expected valid arguments");
3110   // We are going to resume the execution of the scalar loop.
3111   // Go over all of the induction variables that we found and fix the
3112   // PHIs that are left in the scalar version of the loop.
3113   // The starting values of PHI nodes depend on the counter of the last
3114   // iteration in the vectorized loop.
3115   // If we come from a bypass edge then we need to start from the original
3116   // start value.
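  // For illustration: an induction i = start + 4*k resumes in the scalar loop
  // at start + 4 * VectorTripCount when arriving from the middle block, and at
  // its original start value when arriving from a bypass block.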
3117   Instruction *OldInduction = Legal->getPrimaryInduction();
3118   for (auto &InductionEntry : Legal->getInductionVars()) {
3119     PHINode *OrigPhi = InductionEntry.first;
3120     InductionDescriptor II = InductionEntry.second;
3121 
3122     Value *&EndValue = IVEndValues[OrigPhi];
3123     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3124     if (OrigPhi == OldInduction) {
3125       // We know what the end value is.
3126       EndValue = VectorTripCount;
3127     } else {
3128       IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3129 
3130       // Fast-math-flags propagate from the original induction instruction.
3131       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3132         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3133 
3134       Type *StepType = II.getStep()->getType();
3135       Instruction::CastOps CastOp =
3136           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3137       Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
3138       Value *Step =
3139           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3140       EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3141       EndValue->setName("ind.end");
3142 
3143       // Compute the end value for the additional bypass (if applicable).
3144       if (AdditionalBypass.first) {
3145         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3146         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3147                                          StepType, true);
3148         Value *Step =
3149             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3150         VTC =
3151             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
3152         EndValueFromAdditionalBypass =
3153             emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3154         EndValueFromAdditionalBypass->setName("ind.end");
3155       }
3156     }
3157 
3158     // Create phi nodes to merge from the backedge-taken check block.
3159     PHINode *BCResumeVal =
3160         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3161                         LoopScalarPreHeader->getTerminator());
3162     // Copy original phi DL over to the new one.
3163     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3164 
3165     // The new PHI merges the original incoming value, in case of a bypass,
3166     // or the value at the end of the vectorized loop.
3167     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3168 
3169     // Fix the scalar body counter (PHI node).
3170     // The old induction's phi node in the scalar body needs the truncated
3171     // value.
3172     for (BasicBlock *BB : LoopBypassBlocks)
3173       BCResumeVal->addIncoming(II.getStartValue(), BB);
3174 
3175     if (AdditionalBypass.first)
3176       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3177                                             EndValueFromAdditionalBypass);
3178 
3179     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3180   }
3181 }
3182 
3183 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3184   // The trip counts should be cached by now.
3185   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3186   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3187 
3188   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3189 
3190   // Add a check in the middle block to see if we have completed
3191   // all of the iterations in the first vector loop.  Three cases:
3192   // 1) If we require a scalar epilogue, there is no conditional branch as
3193   //    we unconditionally branch to the scalar preheader.  Do nothing.
3194   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3195   //    Thus if tail is to be folded, we know we don't need to run the
3196   //    remainder and we can use the previous value for the condition (true).
3197   // 3) Otherwise, construct a runtime check.
3198   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3199     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3200                                         Count, VectorTripCount, "cmp.n",
3201                                         LoopMiddleBlock->getTerminator());
3202 
3203     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3204     // of the corresponding compare because they may have ended up with
3205     // different line numbers and we want to avoid awkward line stepping while
3206     // debugging. E.g., if the compare got a line number inside the loop.
3207     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3208     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3209   }
3210 
3211 #ifdef EXPENSIVE_CHECKS
3212   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3213 #endif
3214 
3215   return LoopVectorPreHeader;
3216 }
3217 
3218 std::pair<BasicBlock *, Value *>
3219 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3220   /*
3221    In this function we generate a new loop. The new loop will contain
3222    the vectorized instructions while the old loop will continue to run the
3223    scalar remainder.
3224 
3225        [ ] <-- loop iteration number check.
3226     /   |
3227    /    v
3228   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3229   |  /  |
3230   | /   v
3231   ||   [ ]     <-- vector pre header.
3232   |/    |
3233   |     v
3234   |    [  ] \
3235   |    [  ]_|   <-- vector loop (created during VPlan execution).
3236   |     |
3237   |     v
3238   \   -[ ]   <--- middle-block.
3239    \/   |
3240    /\   v
3241    | ->[ ]     <--- new preheader.
3242    |    |
3243  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3244    |   [ ] \
3245    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3246     \   |
3247      \  v
3248       >[ ]     <-- exit block(s).
3249    ...
3250    */
3251 
3252   // Get the metadata of the original loop before it gets modified.
3253   MDNode *OrigLoopID = OrigLoop->getLoopID();
3254 
3255   // Workaround!  Compute the trip count of the original loop and cache it
3256   // before we start modifying the CFG.  This code has a systemic problem
3257   // wherein it tries to run analysis over partially constructed IR; this is
3258   // wrong, and not simply for SCEV.  The trip count of the original loop
3259   // simply happens to be prone to hitting this in practice.  In theory, we
3260   // can hit the same issue for any SCEV, or ValueTracking query done during
3261   // mutation.  See PR49900.
3262   getOrCreateTripCount(OrigLoop->getLoopPreheader());
3263 
3264   // Create an empty vector loop, and prepare basic blocks for the runtime
3265   // checks.
3266   createVectorLoopSkeleton("");
3267 
3268   // Now, compare the new count to zero. If it is zero skip the vector loop and
3269   // jump to the scalar loop. This check also covers the case where the
3270   // backedge-taken count is uint##_max: adding one to it will overflow leading
3271   // to an incorrect trip count of zero. In this (rare) case we will also jump
3272   // to the scalar loop.
3273   emitIterationCountCheck(LoopScalarPreHeader);
3274 
3275   // Generate the code to check any assumptions that we've made for SCEV
3276   // expressions.
3277   emitSCEVChecks(LoopScalarPreHeader);
3278 
3279   // Generate the code that checks at runtime if arrays overlap. We put the
3280   // checks into a separate block to make the more common case of few elements
3281   // faster.
3282   emitMemRuntimeChecks(LoopScalarPreHeader);
3283 
3284   // Emit phis for the new starting index of the scalar loop.
3285   createInductionResumeValues();
3286 
3287   return {completeLoopSkeleton(OrigLoopID), nullptr};
3288 }
3289 
3290 // Fix up external users of the induction variable. At this point, we are
3291 // in LCSSA form, with all external PHIs that use the IV having one input value,
3292 // coming from the remainder loop. We need those PHIs to also have a correct
3293 // value for the IV when arriving directly from the middle block.
3294 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3295                                        const InductionDescriptor &II,
3296                                        Value *VectorTripCount, Value *EndValue,
3297                                        BasicBlock *MiddleBlock,
3298                                        BasicBlock *VectorHeader, VPlan &Plan) {
3299   // There are two kinds of external IV usages - those that use the value
3300   // computed in the last iteration (the PHI) and those that use the penultimate
3301   // value (the value that feeds into the phi from the loop latch).
3302   // We allow both, but they, obviously, have different values.
3303 
3304   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3305 
3306   DenseMap<Value *, Value *> MissingVals;
3307 
3308   // An external user of the last iteration's value should see the value that
3309   // the remainder loop uses to initialize its own IV.
3310   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3311   for (User *U : PostInc->users()) {
3312     Instruction *UI = cast<Instruction>(U);
3313     if (!OrigLoop->contains(UI)) {
3314       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3315       MissingVals[UI] = EndValue;
3316     }
3317   }
3318 
3319   // An external user of the penultimate value needs to see EndValue - Step.
3320   // The simplest way to get this is to recompute it from the constituent SCEVs,
3321   // that is, Start + (Step * (CRD - 1)).
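  // For example (illustrative): for an IV with Start = 0, Step = 2 and a
  // vector trip count of 8, the escaping penultimate value computed below is
  // 0 + 2 * (8 - 1) = 14.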
3322   for (User *U : OrigPhi->users()) {
3323     auto *UI = cast<Instruction>(U);
3324     if (!OrigLoop->contains(UI)) {
3325       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3326 
3327       IRBuilder<> B(MiddleBlock->getTerminator());
3328 
3329       // Fast-math-flags propagate from the original induction instruction.
3330       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3331         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3332 
3333       Value *CountMinusOne = B.CreateSub(
3334           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3335       Value *CMO =
3336           !II.getStep()->getType()->isIntegerTy()
3337               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3338                              II.getStep()->getType())
3339               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3340       CMO->setName("cast.cmo");
3341 
3342       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3343                                     VectorHeader->getTerminator());
3344       Value *Escape =
3345           emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3346       Escape->setName("ind.escape");
3347       MissingVals[UI] = Escape;
3348     }
3349   }
3350 
3351   for (auto &I : MissingVals) {
3352     PHINode *PHI = cast<PHINode>(I.first);
3353     // One corner case we have to handle is two IVs "chasing" each other,
3354     // that is, %IV2 = phi [...], [ %IV1, %latch ].
3355     // In this case, if IV1 has an external use, we need to avoid adding both
3356     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3357     // don't already have an incoming value for the middle block.
3358     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3359       PHI->addIncoming(I.second, MiddleBlock);
3360       Plan.removeLiveOut(PHI);
3361     }
3362   }
3363 }
3364 
3365 namespace {
3366 
3367 struct CSEDenseMapInfo {
3368   static bool canHandle(const Instruction *I) {
3369     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3370            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3371   }
3372 
3373   static inline Instruction *getEmptyKey() {
3374     return DenseMapInfo<Instruction *>::getEmptyKey();
3375   }
3376 
3377   static inline Instruction *getTombstoneKey() {
3378     return DenseMapInfo<Instruction *>::getTombstoneKey();
3379   }
3380 
3381   static unsigned getHashValue(const Instruction *I) {
3382     assert(canHandle(I) && "Unknown instruction!");
3383     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3384                                                            I->value_op_end()));
3385   }
3386 
3387   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3388     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3389         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3390       return LHS == RHS;
3391     return LHS->isIdenticalTo(RHS);
3392   }
3393 };
3394 
3395 } // end anonymous namespace
3396 
3397 /// Perform CSE of induction variable instructions.
3398 static void cse(BasicBlock *BB) {
3399   // Perform simple CSE.
3400   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3401   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3402     if (!CSEDenseMapInfo::canHandle(&In))
3403       continue;
3404 
3405     // Check if we can replace this instruction with any of the
3406     // visited instructions.
3407     if (Instruction *V = CSEMap.lookup(&In)) {
3408       In.replaceAllUsesWith(V);
3409       In.eraseFromParent();
3410       continue;
3411     }
3412 
3413     CSEMap[&In] = &In;
3414   }
3415 }
3416 
3417 InstructionCost
3418 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3419                                               bool &NeedToScalarize) const {
3420   Function *F = CI->getCalledFunction();
3421   Type *ScalarRetTy = CI->getType();
3422   SmallVector<Type *, 4> Tys, ScalarTys;
3423   for (auto &ArgOp : CI->args())
3424     ScalarTys.push_back(ArgOp->getType());
3425 
3426   // Estimate cost of scalarized vector call. The source operands are assumed
3427   // to be vectors, so we need to extract individual elements from them,
3428   // execute VF scalar calls, and then gather the result into the vector return
3429   // value.
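  // For example (illustrative): with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 6, the scalarized estimate is 4 * 10 + 6 = 46; a
  // target-provided vector variant costing less than that is preferred below
  // and NeedToScalarize is cleared.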
3430   InstructionCost ScalarCallCost =
3431       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3432   if (VF.isScalar())
3433     return ScalarCallCost;
3434 
3435   // Compute corresponding vector type for return value and arguments.
3436   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3437   for (Type *ScalarTy : ScalarTys)
3438     Tys.push_back(ToVectorTy(ScalarTy, VF));
3439 
3440   // Compute costs of unpacking argument values for the scalar calls and
3441   // packing the return values to a vector.
3442   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3443 
3444   InstructionCost Cost =
3445       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3446 
3447   // If we can't emit a vector call for this function, then the currently found
3448   // cost is the cost we need to return.
3449   NeedToScalarize = true;
3450   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3451   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3452 
3453   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3454     return Cost;
3455 
3456   // If the corresponding vector cost is cheaper, return its cost.
3457   InstructionCost VectorCallCost =
3458       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3459   if (VectorCallCost < Cost) {
3460     NeedToScalarize = false;
3461     Cost = VectorCallCost;
3462   }
3463   return Cost;
3464 }
3465 
3466 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3467   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3468     return Elt;
3469   return VectorType::get(Elt, VF);
3470 }
3471 
3472 InstructionCost
3473 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3474                                                    ElementCount VF) const {
3475   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3476   assert(ID && "Expected intrinsic call!");
3477   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3478   FastMathFlags FMF;
3479   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3480     FMF = FPMO->getFastMathFlags();
3481 
3482   SmallVector<const Value *> Arguments(CI->args());
3483   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3484   SmallVector<Type *> ParamTys;
3485   std::transform(FTy->param_begin(), FTy->param_end(),
3486                  std::back_inserter(ParamTys),
3487                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3488 
3489   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3490                                     dyn_cast<IntrinsicInst>(CI));
3491   return TTI.getIntrinsicInstrCost(CostAttrs,
3492                                    TargetTransformInfo::TCK_RecipThroughput);
3493 }
3494 
3495 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3496   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3497   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3498   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3499 }
3500 
3501 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3502   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3503   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3504   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3505 }
3506 
3507 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3508   // For every instruction `I` in MinBWs, truncate the operands, create a
3509   // truncated version of `I` and reextend its result. InstCombine runs
3510   // later and will remove any ext/trunc pairs.
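  // For example (illustrative): if a 32-bit add is known to need only 8 bits,
  // the <4 x i32> add is rewritten, schematically, as
  //   %t = add <4 x i8> (%x truncated), (%y truncated)
  //   %r = zext <4 x i8> %t to <4 x i32>
  // and later InstCombine cleanups remove any redundant trunc/zext pairs.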
3511   SmallPtrSet<Value *, 4> Erased;
3512   for (const auto &KV : Cost->getMinimalBitwidths()) {
3513     // If the value wasn't vectorized, we must maintain the original scalar
3514     // type. The absence of the value from State indicates that it
3515     // wasn't vectorized.
3516     // FIXME: Should not rely on getVPValue at this point.
3517     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3518     if (!State.hasAnyVectorValue(Def))
3519       continue;
3520     for (unsigned Part = 0; Part < UF; ++Part) {
3521       Value *I = State.get(Def, Part);
3522       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3523         continue;
3524       Type *OriginalTy = I->getType();
3525       Type *ScalarTruncatedTy =
3526           IntegerType::get(OriginalTy->getContext(), KV.second);
3527       auto *TruncatedTy = VectorType::get(
3528           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3529       if (TruncatedTy == OriginalTy)
3530         continue;
3531 
3532       IRBuilder<> B(cast<Instruction>(I));
3533       auto ShrinkOperand = [&](Value *V) -> Value * {
3534         if (auto *ZI = dyn_cast<ZExtInst>(V))
3535           if (ZI->getSrcTy() == TruncatedTy)
3536             return ZI->getOperand(0);
3537         return B.CreateZExtOrTrunc(V, TruncatedTy);
3538       };
3539 
3540       // The actual instruction modification depends on the instruction type,
3541       // unfortunately.
3542       Value *NewI = nullptr;
3543       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3544         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3545                              ShrinkOperand(BO->getOperand(1)));
3546 
3547         // Any wrapping introduced by shrinking this operation shouldn't be
3548         // considered undefined behavior. So, we can't unconditionally copy
3549         // arithmetic wrapping flags to NewI.
3550         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3551       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3552         NewI =
3553             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3554                          ShrinkOperand(CI->getOperand(1)));
3555       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3556         NewI = B.CreateSelect(SI->getCondition(),
3557                               ShrinkOperand(SI->getTrueValue()),
3558                               ShrinkOperand(SI->getFalseValue()));
3559       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3560         switch (CI->getOpcode()) {
3561         default:
3562           llvm_unreachable("Unhandled cast!");
3563         case Instruction::Trunc:
3564           NewI = ShrinkOperand(CI->getOperand(0));
3565           break;
3566         case Instruction::SExt:
3567           NewI = B.CreateSExtOrTrunc(
3568               CI->getOperand(0),
3569               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3570           break;
3571         case Instruction::ZExt:
3572           NewI = B.CreateZExtOrTrunc(
3573               CI->getOperand(0),
3574               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3575           break;
3576         }
3577       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3578         auto Elements0 =
3579             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3580         auto *O0 = B.CreateZExtOrTrunc(
3581             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3582         auto Elements1 =
3583             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3584         auto *O1 = B.CreateZExtOrTrunc(
3585             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3586 
3587         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3588       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3589         // Don't do anything with the operands, just extend the result.
3590         continue;
3591       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3592         auto Elements =
3593             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3594         auto *O0 = B.CreateZExtOrTrunc(
3595             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3596         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3597         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3598       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3599         auto Elements =
3600             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3601         auto *O0 = B.CreateZExtOrTrunc(
3602             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3603         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3604       } else {
3605         // If we don't know what to do, be conservative and don't do anything.
3606         continue;
3607       }
3608 
3609       // Lastly, extend the result.
3610       NewI->takeName(cast<Instruction>(I));
3611       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3612       I->replaceAllUsesWith(Res);
3613       cast<Instruction>(I)->eraseFromParent();
3614       Erased.insert(I);
3615       State.reset(Def, Res, Part);
3616     }
3617   }
3618 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3620   for (const auto &KV : Cost->getMinimalBitwidths()) {
3621     // If the value wasn't vectorized, we must maintain the original scalar
3622     // type. The absence of the value from State indicates that it
3623     // wasn't vectorized.
3624     // FIXME: Should not rely on getVPValue at this point.
3625     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3626     if (!State.hasAnyVectorValue(Def))
3627       continue;
3628     for (unsigned Part = 0; Part < UF; ++Part) {
3629       Value *I = State.get(Def, Part);
3630       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3631       if (Inst && Inst->use_empty()) {
3632         Value *NewI = Inst->getOperand(0);
3633         Inst->eraseFromParent();
3634         State.reset(Def, NewI, Part);
3635       }
3636     }
3637   }
3638 }
3639 
3640 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3641                                             VPlan &Plan) {
3642   // Insert truncates and extends for any truncated instructions as hints to
3643   // InstCombine.
3644   if (VF.isVector())
3645     truncateToMinimalBitwidths(State);
3646 
3647   // Fix widened non-induction PHIs by setting up the PHI operands.
3648   if (EnableVPlanNativePath)
3649     fixNonInductionPHIs(Plan, State);
3650 
3651   // At this point every instruction in the original loop is widened to a
3652   // vector form. Now we need to fix the recurrences in the loop. These PHI
3653   // nodes are currently empty because we did not want to introduce cycles.
3654   // This is the second stage of vectorizing recurrences.
3655   fixCrossIterationPHIs(State);
3656 
3657   // Forget the original basic block.
3658   PSE.getSE()->forgetLoop(OrigLoop);
3659 
3660   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3661   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3662   if (Cost->requiresScalarEpilogue(VF)) {
    // No edge from the middle block to the unique exit block has been inserted
    // and there is nothing to fix from the vector loop; phis should have
    // incoming values from the scalar loop only.
3666     Plan.clearLiveOuts();
3667   } else {
3668     // If we inserted an edge from the middle block to the unique exit block,
3669     // update uses outside the loop (phis) to account for the newly inserted
3670     // edge.
3671 
3672     // Fix-up external users of the induction variables.
3673     for (auto &Entry : Legal->getInductionVars())
3674       fixupIVUsers(Entry.first, Entry.second,
3675                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3676                    IVEndValues[Entry.first], LoopMiddleBlock,
3677                    VectorLoop->getHeader(), Plan);
3678   }
3679 
3680   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3681   // in the exit block, so update the builder.
3682   State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3683   for (auto &KV : Plan.getLiveOuts())
3684     KV.second->fixPhi(Plan, State);
3685 
3686   for (Instruction *PI : PredicatedInstructions)
3687     sinkScalarOperands(&*PI);
3688 
3689   // Remove redundant induction instructions.
3690   cse(VectorLoop->getHeader());
3691 
3692   // Set/update profile weights for the vector and remainder loops as original
3693   // loop iterations are now distributed among them. Note that original loop
3694   // represented by LoopScalarBody becomes remainder loop after vectorization.
3695   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly roughened result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of the vector code caused by legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3705   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3706                                LI->getLoopFor(LoopScalarBody),
3707                                VF.getKnownMinValue() * UF);
3708 }
3709 
3710 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3711   // In order to support recurrences we need to be able to vectorize Phi nodes.
3712   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3713   // stage #2: We now need to fix the recurrences by adding incoming edges to
3714   // the currently empty PHI nodes. At this point every instruction in the
3715   // original loop is widened to a vector form so we can use them to construct
3716   // the incoming edges.
3717   VPBasicBlock *Header =
3718       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3719   for (VPRecipeBase &R : Header->phis()) {
3720     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3721       fixReduction(ReductionPhi, State);
3722     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3723       fixFirstOrderRecurrence(FOR, State);
3724   }
3725 }
3726 
3727 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3728     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3729   // This is the second phase of vectorizing first-order recurrences. An
3730   // overview of the transformation is described below. Suppose we have the
3731   // following loop.
3732   //
3733   //   for (int i = 0; i < n; ++i)
3734   //     b[i] = a[i] - a[i - 1];
3735   //
3736   // There is a first-order recurrence on "a". For this loop, the shorthand
3737   // scalar IR looks like:
3738   //
3739   //   scalar.ph:
3740   //     s_init = a[-1]
3741   //     br scalar.body
3742   //
3743   //   scalar.body:
3744   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3745   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3746   //     s2 = a[i]
3747   //     b[i] = s2 - s1
3748   //     br cond, scalar.body, ...
3749   //
  // In this example, s1 is a recurrence because its value depends on the
3751   // previous iteration. In the first phase of vectorization, we created a
3752   // vector phi v1 for s1. We now complete the vectorization and produce the
3753   // shorthand vector IR shown below (for VF = 4, UF = 1).
3754   //
3755   //   vector.ph:
3756   //     v_init = vector(..., ..., ..., a[-1])
3757   //     br vector.body
3758   //
3759   //   vector.body
3760   //     i = phi [0, vector.ph], [i+4, vector.body]
3761   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3762   //     v2 = a[i, i+1, i+2, i+3];
3763   //     v3 = vector(v1(3), v2(0, 1, 2))
3764   //     b[i, i+1, i+2, i+3] = v2 - v3
3765   //     br cond, vector.body, middle.block
3766   //
3767   //   middle.block:
3768   //     x = v2(3)
3769   //     br scalar.ph
3770   //
3771   //   scalar.ph:
3772   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3773   //     br scalar.body
3774   //
3775   // After execution completes the vector loop, we extract the next value of
3776   // the recurrence (x) to use as the initial value in the scalar loop.
3777 
3778   // Extract the last vector element in the middle block. This will be the
3779   // initial value for the recurrence when jumping to the scalar loop.
3780   VPValue *PreviousDef = PhiR->getBackedgeValue();
3781   Value *Incoming = State.get(PreviousDef, UF - 1);
3782   auto *ExtractForScalar = Incoming;
3783   auto *IdxTy = Builder.getInt32Ty();
3784   if (VF.isVector()) {
3785     auto *One = ConstantInt::get(IdxTy, 1);
3786     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3787     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3788     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3789     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3790                                                     "vector.recur.extract");
3791   }
3792   // Extract the second last element in the middle block if the
3793   // Phi is used outside the loop. We need to extract the phi itself
3794   // and not the last element (the phi update in the current iteration). This
3795   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3796   // when the scalar loop is not run at all.
3797   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3798   if (VF.isVector()) {
3799     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3800     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3801     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3802         Incoming, Idx, "vector.recur.extract.for.phi");
3803   } else if (UF > 1)
3804     // When loop is unrolled without vectorizing, initialize
3805     // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
3806     // of `Incoming`. This is analogous to the vectorized case above: extracting
3807     // the second last element when VF > 1.
3808     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3809 
3810   // Fix the initial value of the original recurrence in the scalar loop.
3811   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3812   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3813   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3814   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3815   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3816     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3817     Start->addIncoming(Incoming, BB);
3818   }
3819 
3820   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3821   Phi->setName("scalar.recur");
3822 
3823   // Finally, fix users of the recurrence outside the loop. The users will need
3824   // either the last value of the scalar recurrence or the last value of the
3825   // vector recurrence we extracted in the middle block. Since the loop is in
3826   // LCSSA form, we just need to find all the phi nodes for the original scalar
3827   // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from the middle block to
  // the exit block, and thus no phis which need updating.
3832   if (!Cost->requiresScalarEpilogue(VF))
3833     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3834       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3835         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3836         State.Plan->removeLiveOut(&LCSSAPhi);
3837       }
3838 }
3839 
3840 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3841                                        VPTransformState &State) {
3842   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3844   assert(Legal->isReductionVariable(OrigPhi) &&
3845          "Unable to find the reduction variable");
3846   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3847 
3848   RecurKind RK = RdxDesc.getRecurrenceKind();
3849   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3850   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3851   State.setDebugLocFromInst(ReductionStartValue);
3852 
3853   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3854   // This is the vector-clone of the value that leaves the loop.
3855   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3856 
3857   // Wrap flags are in general invalid after vectorization, clear them.
3858   clearReductionWrapFlags(PhiR, State);
3859 
3860   // Before each round, move the insertion point right between
3861   // the PHIs and the values we are going to write.
3862   // This allows us to write both PHINodes and the extractelement
3863   // instructions.
3864   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3865 
3866   State.setDebugLocFromInst(LoopExitInst);
3867 
3868   Type *PhiTy = OrigPhi->getType();
3869 
3870   VPBasicBlock *LatchVPBB =
3871       PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3872   BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3873   // If tail is folded by masking, the vector value to leave the loop should be
3874   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3875   // instead of the former. For an inloop reduction the reduction will already
3876   // be predicated, and does not need to be handled here.
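  // Shorthand (illustrative only), for a tail-folded add reduction with
  // VF = 4:
  //
  //   %rdx.next = add <4 x i32> %rdx.phi, %val
  //   %rdx.sel  = select <4 x i1> %mask, <4 x i32> %rdx.next,
  //                                      <4 x i32> %rdx.phi
  //
  // The select, rather than %rdx.next, is the value that leaves the loop.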
3877   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3878     for (unsigned Part = 0; Part < UF; ++Part) {
3879       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3880       SelectInst *Sel = nullptr;
3881       for (User *U : VecLoopExitInst->users()) {
3882         if (isa<SelectInst>(U)) {
3883           assert(!Sel && "Reduction exit feeding two selects");
3884           Sel = cast<SelectInst>(U);
3885         } else
3886           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3887       }
3888       assert(Sel && "Reduction exit feeds no select");
3889       State.reset(LoopExitInstDef, Sel, Part);
3890 
3891       if (isa<FPMathOperator>(Sel))
3892         Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3893 
3894       // If the target can create a predicated operator for the reduction at no
3895       // extra cost in the loop (for example a predicated vadd), it can be
3896       // cheaper for the select to remain in the loop than be sunk out of it,
3897       // and so use the select value for the phi instead of the old
3898       // LoopExitValue.
3899       if (PreferPredicatedReductionSelect ||
3900           TTI->preferPredicatedReductionSelect(
3901               RdxDesc.getOpcode(), PhiTy,
3902               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
3905         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3906       }
3907     }
3908   }
3909 
3910   // If the vector reduction can be performed in a smaller type, we truncate
3911   // then extend the loop exit value to enable InstCombine to evaluate the
3912   // entire expression in the smaller type.
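  // Shorthand (illustrative only), for an i32 phi whose reduction fits in i8
  // with VF = 4:
  //
  //   %rdx.trunc = trunc <4 x i32> %rdx to <4 x i8>
  //   %rdx.extnd = zext <4 x i8> %rdx.trunc to <4 x i32>   ; or sext if signed
  //
  // In-loop users of %rdx are rewired to %rdx.extnd, and the truncated value
  // is the one reduced in the middle block.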
3913   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3914     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3915     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3916     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3917     VectorParts RdxParts(UF);
3918     for (unsigned Part = 0; Part < UF; ++Part) {
3919       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3920       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3921       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3922                                         : Builder.CreateZExt(Trunc, VecTy);
3923       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3924         if (U != Trunc) {
3925           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3926           RdxParts[Part] = Extnd;
3927         }
3928     }
3929     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3930     for (unsigned Part = 0; Part < UF; ++Part) {
3931       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3932       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3933     }
3934   }
3935 
3936   // Reduce all of the unrolled parts into a single vector.
3937   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3938   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3939 
3940   // The middle block terminator has already been assigned a DebugLoc here (the
3941   // OrigLoop's single latch terminator). We want the whole middle block to
3942   // appear to execute on this line because: (a) it is all compiler generated,
3943   // (b) these instructions are always executed after evaluating the latch
3944   // conditional branch, and (c) other passes may add new predecessors which
3945   // terminate on this line. This is the easiest way to ensure we don't
3946   // accidentally cause an extra step back into the loop while debugging.
3947   State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3948   if (PhiR->isOrdered())
3949     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3950   else {
3951     // Floating-point operations should have some FMF to enable the reduction.
3952     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3953     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3954     for (unsigned Part = 1; Part < UF; ++Part) {
3955       Value *RdxPart = State.get(LoopExitInstDef, Part);
3956       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3957         ReducedPartRdx = Builder.CreateBinOp(
3958             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3959       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3960         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3961                                            ReducedPartRdx, RdxPart);
3962       else
3963         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3964     }
3965   }
3966 
3967   // Create the reduction after the loop. Note that inloop reductions create the
3968   // target reduction in the loop using a Reduction recipe.
3969   if (VF.isVector() && !PhiR->isInLoop()) {
3970     ReducedPartRdx =
3971         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3972     // If the reduction can be performed in a smaller type, we need to extend
3973     // the reduction to the wider type before we branch to the original loop.
3974     if (PhiTy != RdxDesc.getRecurrenceType())
3975       ReducedPartRdx = RdxDesc.isSigned()
3976                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3977                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3978   }
3979 
3980   PHINode *ResumePhi =
3981       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3982 
3983   // Create a phi node that merges control-flow from the backedge-taken check
3984   // block and the middle block.
3985   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3986                                         LoopScalarPreHeader->getTerminator());
3987 
3988   // If we are fixing reductions in the epilogue loop then we should already
3989   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
3990   // we carry over the incoming values correctly.
3991   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
3992     if (Incoming == LoopMiddleBlock)
3993       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
3994     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
3995       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
3996                               Incoming);
3997     else
3998       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
3999   }
4000 
4001   // Set the resume value for this reduction
4002   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4003 
4004   // If there were stores of the reduction value to a uniform memory address
4005   // inside the loop, create the final store here.
4006   if (StoreInst *SI = RdxDesc.IntermediateStore) {
4007     StoreInst *NewSI =
4008         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4009     propagateMetadata(NewSI, SI);
4010 
4011     // If the reduction value is used in other places,
4012     // then let the code below create PHI's for that.
4013   }
4014 
4015   // Now, we need to fix the users of the reduction variable
4016   // inside and outside of the scalar remainder loop.
4017 
4018   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4019   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4021   if (!Cost->requiresScalarEpilogue(VF))
4022     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4023       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4024         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4025         State.Plan->removeLiveOut(&LCSSAPhi);
4026       }
4027 
4028   // Fix the scalar loop reduction variable with the incoming reduction sum
4029   // from the vector body and from the backedge value.
4030   int IncomingEdgeBlockIdx =
4031       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4032   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4033   // Pick the other block.
4034   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4035   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4036   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4037 }
4038 
4039 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4040                                                   VPTransformState &State) {
4041   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4042   RecurKind RK = RdxDesc.getRecurrenceKind();
4043   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4044     return;
4045 
4046   SmallVector<VPValue *, 8> Worklist;
4047   SmallPtrSet<VPValue *, 8> Visited;
4048   Worklist.push_back(PhiR);
4049   Visited.insert(PhiR);
4050 
4051   while (!Worklist.empty()) {
4052     VPValue *Cur = Worklist.pop_back_val();
4053     for (unsigned Part = 0; Part < UF; ++Part) {
4054       Value *V = State.get(Cur, Part);
4055       if (!isa<OverflowingBinaryOperator>(V))
4056         break;
4057       cast<Instruction>(V)->dropPoisonGeneratingFlags();
4058       }
4059 
4060       for (VPUser *U : Cur->users()) {
4061         auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
4062         if (!UserRecipe)
4063           continue;
4064         for (VPValue *V : UserRecipe->definedValues())
4065           if (Visited.insert(V).second)
4066             Worklist.push_back(V);
4067       }
4068   }
4069 }
4070 
4071 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4072   // The basic block and loop containing the predicated instruction.
4073   auto *PredBB = PredInst->getParent();
4074   auto *VectorLoop = LI->getLoopFor(PredBB);
4075 
4076   // Initialize a worklist with the operands of the predicated instruction.
4077   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4078 
4079   // Holds instructions that we need to analyze again. An instruction may be
4080   // reanalyzed if we don't yet know if we can sink it or not.
4081   SmallVector<Instruction *, 8> InstsToReanalyze;
4082 
4083   // Returns true if a given use occurs in the predicated block. Phi nodes use
4084   // their operands in their corresponding predecessor blocks.
4085   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4086     auto *I = cast<Instruction>(U.getUser());
4087     BasicBlock *BB = I->getParent();
4088     if (auto *Phi = dyn_cast<PHINode>(I))
4089       BB = Phi->getIncomingBlock(
4090           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4091     return BB == PredBB;
4092   };
4093 
4094   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4096   // operands are then added to the worklist. The algorithm ends after one pass
4097   // through the worklist doesn't sink a single instruction.
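  // For example (illustrative only), given a predicated block containing
  //
  //   %d = udiv i32 %y, %t
  //
  // where %t = add i32 %x, 1 is defined outside that block and has no other
  // users, %t can be sunk next to %d; its operands (%x) are then reconsidered
  // on the next pass over the worklist.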
4098   bool Changed;
4099   do {
4100     // Add the instructions that need to be reanalyzed to the worklist, and
4101     // reset the changed indicator.
4102     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4103     InstsToReanalyze.clear();
4104     Changed = false;
4105 
4106     while (!Worklist.empty()) {
4107       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4108 
4109       // We can't sink an instruction if it is a phi node, is not in the loop,
4110       // or may have side effects.
4111       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4112           I->mayHaveSideEffects())
4113         continue;
4114 
4115       // If the instruction is already in PredBB, check if we can sink its
4116       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4117       // sinking the scalar instruction I, hence it appears in PredBB; but it
4118       // may have failed to sink I's operands (recursively), which we try
4119       // (again) here.
4120       if (I->getParent() == PredBB) {
4121         Worklist.insert(I->op_begin(), I->op_end());
4122         continue;
4123       }
4124 
4125       // It's legal to sink the instruction if all its uses occur in the
4126       // predicated block. Otherwise, there's nothing to do yet, and we may
4127       // need to reanalyze the instruction.
4128       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4129         InstsToReanalyze.push_back(I);
4130         continue;
4131       }
4132 
4133       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4135       I->moveBefore(&*PredBB->getFirstInsertionPt());
4136       Worklist.insert(I->op_begin(), I->op_end());
4137 
4138       // The sinking may have enabled other instructions to be sunk, so we will
4139       // need to iterate.
4140       Changed = true;
4141     }
4142   } while (Changed);
4143 }
4144 
4145 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4146                                               VPTransformState &State) {
4147   auto Iter = depth_first(
4148       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
4149   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4150     for (VPRecipeBase &P : VPBB->phis()) {
4151       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4152       if (!VPPhi)
4153         continue;
4154       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4155       // Make sure the builder has a valid insert point.
4156       Builder.SetInsertPoint(NewPhi);
4157       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4158         VPValue *Inc = VPPhi->getIncomingValue(i);
4159         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4160         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4161       }
4162     }
4163   }
4164 }
4165 
4166 bool InnerLoopVectorizer::useOrderedReductions(
4167     const RecurrenceDescriptor &RdxDesc) {
4168   return Cost->useOrderedReductions(RdxDesc);
4169 }
4170 
4171 /// A helper function for checking whether an integer division-related
4172 /// instruction may divide by zero (in which case it must be predicated if
4173 /// executed conditionally in the scalar code).
4174 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplications, so we will still end up scalarizing
/// the division, but can do so without predication.
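/// For example (illustrative only), a loop such as
///
///   for (int i = 0; i < n; ++i)
///     if (b[i] != 0)
///       c[i] = a[i] / b[i];
///
/// must keep the scalarized division predicated, since flattening the control
/// flow would otherwise execute the division on lanes where b[i] == 0.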
4178 static bool mayDivideByZero(Instruction &I) {
4179   assert((I.getOpcode() == Instruction::UDiv ||
4180           I.getOpcode() == Instruction::SDiv ||
4181           I.getOpcode() == Instruction::URem ||
4182           I.getOpcode() == Instruction::SRem) &&
4183          "Unexpected instruction");
4184   Value *Divisor = I.getOperand(1);
4185   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4186   return !CInt || CInt->isZero();
4187 }
4188 
4189 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4190                                                VPUser &ArgOperands,
4191                                                VPTransformState &State) {
4192   assert(!isa<DbgInfoIntrinsic>(I) &&
4193          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4194   State.setDebugLocFromInst(&I);
4195 
4196   Module *M = I.getParent()->getParent()->getParent();
4197   auto *CI = cast<CallInst>(&I);
4198 
4199   SmallVector<Type *, 4> Tys;
4200   for (Value *ArgOperand : CI->args())
4201     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4202 
4203   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4204 
  // The flag indicates whether we use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e. whether an intrinsic call is
  // more beneficial than a library call.
4208   bool NeedToScalarize = false;
4209   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4210   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4211   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4212   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4213          "Instruction should be scalarized elsewhere.");
4214   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4215          "Either the intrinsic cost or vector call cost must be valid");
4216 
4217   for (unsigned Part = 0; Part < UF; ++Part) {
4218     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4219     SmallVector<Value *, 4> Args;
4220     for (auto &I : enumerate(ArgOperands.operands())) {
4221       // Some intrinsics have a scalar argument - don't replace it with a
4222       // vector.
4223       Value *Arg;
4224       if (!UseVectorIntrinsic ||
4225           !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
4226         Arg = State.get(I.value(), Part);
4227       else
4228         Arg = State.get(I.value(), VPIteration(0, 0));
4229       if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
4230         TysForDecl.push_back(Arg->getType());
4231       Args.push_back(Arg);
4232     }
4233 
4234     Function *VectorF;
4235     if (UseVectorIntrinsic) {
4236       // Use vector version of the intrinsic.
4237       if (VF.isVector())
4238         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4239       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4240       assert(VectorF && "Can't retrieve vector intrinsic.");
4241     } else {
4242       // Use vector version of the function call.
4243       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4244 #ifndef NDEBUG
4245       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4246              "Can't create vector function.");
4247 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    State.addMetadata(V, &I);
4259   }
4260 }
4261 
4262 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4263   // We should not collect Scalars more than once per VF. Right now, this
4264   // function is called from collectUniformsAndScalars(), which already does
4265   // this check. Collecting Scalars for VF=1 does not make any sense.
4266   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4267          "This function should not be visited twice for the same VF");
4268 
4269   // This avoids any chances of creating a REPLICATE recipe during planning
4270   // since that would result in generation of scalarized code during execution,
4271   // which is not supported for scalable vectors.
4272   if (VF.isScalable()) {
4273     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4274     return;
4275   }
4276 
4277   SmallSetVector<Instruction *, 8> Worklist;
4278 
4279   // These sets are used to seed the analysis with pointers used by memory
4280   // accesses that will remain scalar.
4281   SmallSetVector<Instruction *, 8> ScalarPtrs;
4282   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4283   auto *Latch = TheLoop->getLoopLatch();
4284 
4285   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4286   // The pointer operands of loads and stores will be scalar as long as the
4287   // memory access is not a gather or scatter operation. The value operand of a
4288   // store will remain scalar if the store is scalarized.
4289   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4290     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4291     assert(WideningDecision != CM_Unknown &&
4292            "Widening decision should be ready at this moment");
4293     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4294       if (Ptr == Store->getValueOperand())
4295         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4298     return WideningDecision != CM_GatherScatter;
4299   };
4300 
4301   // A helper that returns true if the given value is a bitcast or
4302   // getelementptr instruction contained in the loop.
4303   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4304     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4305             isa<GetElementPtrInst>(V)) &&
4306            !TheLoop->isLoopInvariant(V);
4307   };
4308 
4309   // A helper that evaluates a memory access's use of a pointer. If the use will
4310   // be a scalar use and the pointer is only used by memory accesses, we place
4311   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4312   // PossibleNonScalarPtrs.
4313   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4314     // We only care about bitcast and getelementptr instructions contained in
4315     // the loop.
4316     if (!isLoopVaryingBitCastOrGEP(Ptr))
4317       return;
4318 
4319     // If the pointer has already been identified as scalar (e.g., if it was
4320     // also identified as uniform), there's nothing to do.
4321     auto *I = cast<Instruction>(Ptr);
4322     if (Worklist.count(I))
4323       return;
4324 
4325     // If the use of the pointer will be a scalar use, and all users of the
4326     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4327     // place the pointer in PossibleNonScalarPtrs.
4328     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4329           return isa<LoadInst>(U) || isa<StoreInst>(U);
4330         }))
4331       ScalarPtrs.insert(I);
4332     else
4333       PossibleNonScalarPtrs.insert(I);
4334   };
4335 
  // We seed the scalars analysis with two classes of instructions: (1)
4337   // instructions marked uniform-after-vectorization and (2) bitcast,
4338   // getelementptr and (pointer) phi instructions used by memory accesses
4339   // requiring a scalar use.
4340   //
4341   // (1) Add to the worklist all instructions that have been identified as
4342   // uniform-after-vectorization.
4343   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4344 
4345   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4346   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4348   // scatter operation. The value operand of a store will remain scalar if the
4349   // store is scalarized.
4350   for (auto *BB : TheLoop->blocks())
4351     for (auto &I : *BB) {
4352       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4353         evaluatePtrUse(Load, Load->getPointerOperand());
4354       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4355         evaluatePtrUse(Store, Store->getPointerOperand());
4356         evaluatePtrUse(Store, Store->getValueOperand());
4357       }
4358     }
4359   for (auto *I : ScalarPtrs)
4360     if (!PossibleNonScalarPtrs.count(I)) {
4361       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4362       Worklist.insert(I);
4363     }
4364 
4365   // Insert the forced scalars.
4366   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4367   // induction variable when the PHI user is scalarized.
4368   auto ForcedScalar = ForcedScalars.find(VF);
4369   if (ForcedScalar != ForcedScalars.end())
4370     for (auto *I : ForcedScalar->second)
4371       Worklist.insert(I);
4372 
4373   // Expand the worklist by looking through any bitcasts and getelementptr
4374   // instructions we've already identified as scalar. This is similar to the
4375   // expansion step in collectLoopUniforms(); however, here we're only
4376   // expanding to include additional bitcasts and getelementptr instructions.
4377   unsigned Idx = 0;
4378   while (Idx != Worklist.size()) {
4379     Instruction *Dst = Worklist[Idx++];
4380     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4381       continue;
4382     auto *Src = cast<Instruction>(Dst->getOperand(0));
4383     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4384           auto *J = cast<Instruction>(U);
4385           return !TheLoop->contains(J) || Worklist.count(J) ||
4386                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4387                   isScalarUse(J, Src));
4388         })) {
4389       Worklist.insert(Src);
4390       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4391     }
4392   }
4393 
4394   // An induction variable will remain scalar if all users of the induction
4395   // variable and induction variable update remain scalar.
4396   for (auto &Induction : Legal->getInductionVars()) {
4397     auto *Ind = Induction.first;
4398     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4399 
4400     // If tail-folding is applied, the primary induction variable will be used
4401     // to feed a vector compare.
4402     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4403       continue;
4404 
4405     // Returns true if \p Indvar is a pointer induction that is used directly by
4406     // load/store instruction \p I.
4407     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4408                                               Instruction *I) {
4409       return Induction.second.getKind() ==
4410                  InductionDescriptor::IK_PtrInduction &&
4411              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4412              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4413     };
4414 
4415     // Determine if all users of the induction variable are scalar after
4416     // vectorization.
4417     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4418       auto *I = cast<Instruction>(U);
4419       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4420              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4421     });
4422     if (!ScalarInd)
4423       continue;
4424 
4425     // Determine if all users of the induction variable update instruction are
4426     // scalar after vectorization.
4427     auto ScalarIndUpdate =
4428         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4429           auto *I = cast<Instruction>(U);
4430           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4431                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4432         });
4433     if (!ScalarIndUpdate)
4434       continue;
4435 
4436     // The induction variable and its update instruction will remain scalar.
4437     Worklist.insert(Ind);
4438     Worklist.insert(IndUpdate);
4439     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4440     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4441                       << "\n");
4442   }
4443 
4444   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4445 }
4446 
4447 bool LoopVectorizationCostModel::isScalarWithPredication(
4448     Instruction *I, ElementCount VF) const {
4449   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4450     return false;
  switch (I->getOpcode()) {
4452   default:
4453     break;
4454   case Instruction::Load:
4455   case Instruction::Store: {
4456     if (!Legal->isMaskRequired(I))
4457       return false;
4458     auto *Ptr = getLoadStorePointerOperand(I);
4459     auto *Ty = getLoadStoreType(I);
4460     Type *VTy = Ty;
4461     if (VF.isVector())
4462       VTy = VectorType::get(Ty, VF);
4463     const Align Alignment = getLoadStoreAlignment(I);
4464     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4465                                 TTI.isLegalMaskedGather(VTy, Alignment))
4466                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4467                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4468   }
4469   case Instruction::UDiv:
4470   case Instruction::SDiv:
4471   case Instruction::SRem:
4472   case Instruction::URem:
4473     return mayDivideByZero(*I);
4474   }
4475   return false;
4476 }
4477 
4478 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4479     Instruction *I, ElementCount VF) {
4480   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4481   assert(getWideningDecision(I, VF) == CM_Unknown &&
4482          "Decision should not be set yet.");
4483   auto *Group = getInterleavedAccessGroup(I);
4484   assert(Group && "Must have a group.");
4485 
  // If the instruction's allocated size doesn't equal its type size, it
4487   // requires padding and will be scalarized.
4488   auto &DL = I->getModule()->getDataLayout();
4489   auto *ScalarTy = getLoadStoreType(I);
4490   if (hasIrregularType(ScalarTy, DL))
4491     return false;
4492 
4493   // If the group involves a non-integral pointer, we may not be able to
4494   // losslessly cast all values to a common type.
4495   unsigned InterleaveFactor = Group->getFactor();
4496   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4497   for (unsigned i = 0; i < InterleaveFactor; i++) {
4498     Instruction *Member = Group->getMember(i);
4499     if (!Member)
4500       continue;
4501     auto *MemberTy = getLoadStoreType(Member);
4502     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4503     // Don't coerce non-integral pointers to integers or vice versa.
4504     if (MemberNI != ScalarNI) {
4505       // TODO: Consider adding special nullptr value case here
4506       return false;
4507     } else if (MemberNI && ScalarNI &&
4508                ScalarTy->getPointerAddressSpace() !=
4509                MemberTy->getPointerAddressSpace()) {
4510       return false;
4511     }
4512   }
4513 
4514   // Check if masking is required.
4515   // A Group may need masking for one of two reasons: it resides in a block that
4516   // needs predication, or it was decided to use masking to deal with gaps
4517   // (either a gap at the end of a load-access that may result in a speculative
4518   // load, or any gaps in a store-access).
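  // For example (illustrative only), an interleaved store group that writes
  // A[2*i] but not A[2*i+1] has a gap in every other lane, so the wide store
  // covering both lanes must be masked to avoid writing the untouched
  // elements.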
4519   bool PredicatedAccessRequiresMasking =
4520       blockNeedsPredicationForAnyReason(I->getParent()) &&
4521       Legal->isMaskRequired(I);
4522   bool LoadAccessWithGapsRequiresEpilogMasking =
4523       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4524       !isScalarEpilogueAllowed();
4525   bool StoreAccessWithGapsRequiresMasking =
4526       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4527   if (!PredicatedAccessRequiresMasking &&
4528       !LoadAccessWithGapsRequiresEpilogMasking &&
4529       !StoreAccessWithGapsRequiresMasking)
4530     return true;
4531 
4532   // If masked interleaving is required, we expect that the user/target had
4533   // enabled it, because otherwise it either wouldn't have been created or
4534   // it should have been invalidated by the CostModel.
4535   assert(useMaskedInterleavedAccesses(TTI) &&
4536          "Masked interleave-groups for predicated accesses are not enabled.");
4537 
4538   if (Group->isReverse())
4539     return false;
4540 
4541   auto *Ty = getLoadStoreType(I);
4542   const Align Alignment = getLoadStoreAlignment(I);
4543   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4544                           : TTI.isLegalMaskedStore(Ty, Alignment);
4545 }
4546 
4547 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4548     Instruction *I, ElementCount VF) {
4549   // Get and ensure we have a valid memory instruction.
4550   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4551 
4552   auto *Ptr = getLoadStorePointerOperand(I);
4553   auto *ScalarTy = getLoadStoreType(I);
4554 
4555   // In order to be widened, the pointer should be consecutive, first of all.
4556   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4557     return false;
4558 
4559   // If the instruction is a store located in a predicated block, it will be
4560   // scalarized.
4561   if (isScalarWithPredication(I, VF))
4562     return false;
4563 
  // If the instruction's allocated size doesn't equal its type size, it
4565   // requires padding and will be scalarized.
4566   auto &DL = I->getModule()->getDataLayout();
4567   if (hasIrregularType(ScalarTy, DL))
4568     return false;
4569 
4570   return true;
4571 }
4572 
4573 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4574   // We should not collect Uniforms more than once per VF. Right now,
4575   // this function is called from collectUniformsAndScalars(), which
4576   // already does this check. Collecting Uniforms for VF=1 does not make any
4577   // sense.
4578 
4579   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4580          "This function should not be visited twice for the same VF");
4581 
  // Visit the list of Uniforms. If we do not find any uniform value, we will
  // not analyze it again. Uniforms.count(VF) will return 1.
4584   Uniforms[VF].clear();
4585 
4586   // We now know that the loop is vectorizable!
4587   // Collect instructions inside the loop that will remain uniform after
4588   // vectorization.
4589 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
4592   auto isOutOfScope = [&](Value *V) -> bool {
4593     Instruction *I = dyn_cast<Instruction>(V);
4594     return (!I || !TheLoop->contains(I));
4595   };
4596 
4597   // Worklist containing uniform instructions demanding lane 0.
4598   SetVector<Instruction *> Worklist;
4599   BasicBlock *Latch = TheLoop->getLoopLatch();
4600 
4601   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4602   // that are scalar with predication must not be considered uniform after
4603   // vectorization, because that would create an erroneous replicating region
4604   // where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important; see PR40816.
4606   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4607     if (isOutOfScope(I)) {
4608       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4609                         << *I << "\n");
4610       return;
4611     }
4612     if (isScalarWithPredication(I, VF)) {
4613       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4614                         << *I << "\n");
4615       return;
4616     }
4617     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4618     Worklist.insert(I);
4619   };
4620 
4621   // Start with the conditional branch. If the branch condition is an
4622   // instruction contained in the loop that is only used by the branch, it is
4623   // uniform.
4624   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4625   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4626     addToWorklistIfAllowed(Cmp);
4627 
4628   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4629     InstWidening WideningDecision = getWideningDecision(I, VF);
4630     assert(WideningDecision != CM_Unknown &&
4631            "Widening decision should be ready at this moment");
4632 
4633     // A uniform memory op is itself uniform.  We exclude uniform stores
4634     // here as they demand the last lane, not the first one.
4635     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4636       assert(WideningDecision == CM_Scalarize);
4637       return true;
4638     }
4639 
4640     return (WideningDecision == CM_Widen ||
4641             WideningDecision == CM_Widen_Reverse ||
4642             WideningDecision == CM_Interleave);
4643   };
4644 
4645 
4646   // Returns true if Ptr is the pointer operand of a memory access instruction
4647   // I, and I is known to not require scalarization.
4648   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4649     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4650   };
4651 
4652   // Holds a list of values which are known to have at least one uniform use.
4653   // Note that there may be other uses which aren't uniform.  A "uniform use"
4654   // here is something which only demands lane 0 of the unrolled iterations;
4655   // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform).
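  // For example (illustrative only), in
  //
  //   for (int i = 0; i < n; ++i)
  //     a[i] = b[i] + 1;
  //
  // the address feeding the consecutive load of b[i] has a uniform use: only
  // its lane-0 value is needed to form the wide load, even though the loaded
  // values differ per lane.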
4657   SetVector<Value *> HasUniformUse;
4658 
4659   // Scan the loop for instructions which are either a) known to have only
4660   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4661   for (auto *BB : TheLoop->blocks())
4662     for (auto &I : *BB) {
4663       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4664         switch (II->getIntrinsicID()) {
4665         case Intrinsic::sideeffect:
4666         case Intrinsic::experimental_noalias_scope_decl:
4667         case Intrinsic::assume:
4668         case Intrinsic::lifetime_start:
4669         case Intrinsic::lifetime_end:
4670           if (TheLoop->hasLoopInvariantOperands(&I))
4671             addToWorklistIfAllowed(&I);
4672           break;
4673         default:
4674           break;
4675         }
4676       }
4677 
4678       // ExtractValue instructions must be uniform, because the operands are
4679       // known to be loop-invariant.
4680       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4681         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4682                "Expected aggregate value to be loop invariant");
4683         addToWorklistIfAllowed(EVI);
4684         continue;
4685       }
4686 
4687       // If there's no pointer operand, there's nothing to do.
4688       auto *Ptr = getLoadStorePointerOperand(&I);
4689       if (!Ptr)
4690         continue;
4691 
4692       // A uniform memory op is itself uniform.  We exclude uniform stores
4693       // here as they demand the last lane, not the first one.
4694       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4695         addToWorklistIfAllowed(&I);
4696 
4697       if (isUniformDecision(&I, VF)) {
4698         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4699         HasUniformUse.insert(Ptr);
4700       }
4701     }
4702 
4703   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4704   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4705   // disallows uses outside the loop as well.
4706   for (auto *V : HasUniformUse) {
4707     if (isOutOfScope(V))
4708       continue;
4709     auto *I = cast<Instruction>(V);
4710     auto UsersAreMemAccesses =
4711       llvm::all_of(I->users(), [&](User *U) -> bool {
4712         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4713       });
4714     if (UsersAreMemAccesses)
4715       addToWorklistIfAllowed(I);
4716   }
4717 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
4721   unsigned idx = 0;
4722   while (idx != Worklist.size()) {
4723     Instruction *I = Worklist[idx++];
4724 
4725     for (auto OV : I->operand_values()) {
4726       // isOutOfScope operands cannot be uniform instructions.
4727       if (isOutOfScope(OV))
4728         continue;
      // First-order recurrence PHIs should typically be considered
      // non-uniform.
4731       auto *OP = dyn_cast<PHINode>(OV);
4732       if (OP && Legal->isFirstOrderRecurrence(OP))
4733         continue;
4734       // If all the users of the operand are uniform, then add the
4735       // operand into the uniform worklist.
4736       auto *OI = cast<Instruction>(OV);
4737       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4738             auto *J = cast<Instruction>(U);
4739             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4740           }))
4741         addToWorklistIfAllowed(OI);
4742     }
4743   }
4744 
4745   // For an instruction to be added into Worklist above, all its users inside
4746   // the loop should also be in Worklist. However, this condition cannot be
4747   // true for phi nodes that form a cyclic dependence. We must process phi
4748   // nodes separately. An induction variable will remain uniform if all users
4749   // of the induction variable and induction variable update remain uniform.
4750   // The code below handles both pointer and non-pointer induction variables.
4751   for (auto &Induction : Legal->getInductionVars()) {
4752     auto *Ind = Induction.first;
4753     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4754 
4755     // Determine if all users of the induction variable are uniform after
4756     // vectorization.
4757     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4758       auto *I = cast<Instruction>(U);
4759       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4760              isVectorizedMemAccessUse(I, Ind);
4761     });
4762     if (!UniformInd)
4763       continue;
4764 
4765     // Determine if all users of the induction variable update instruction are
4766     // uniform after vectorization.
4767     auto UniformIndUpdate =
4768         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4769           auto *I = cast<Instruction>(U);
4770           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4771                  isVectorizedMemAccessUse(I, IndUpdate);
4772         });
4773     if (!UniformIndUpdate)
4774       continue;
4775 
4776     // The induction variable and its update instruction will remain uniform.
4777     addToWorklistIfAllowed(Ind);
4778     addToWorklistIfAllowed(IndUpdate);
4779   }
4780 
4781   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4782 }
4783 
4784 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4785   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4786 
4787   if (Legal->getRuntimePointerChecking()->Need) {
4788     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4789         "runtime pointer checks needed. Enable vectorization of this "
4790         "loop with '#pragma clang loop vectorize(enable)' when "
4791         "compiling with -Os/-Oz",
4792         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4793     return true;
4794   }
4795 
4796   if (!PSE.getPredicate().isAlwaysTrue()) {
4797     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4798         "runtime SCEV checks needed. Enable vectorization of this "
4799         "loop with '#pragma clang loop vectorize(enable)' when "
4800         "compiling with -Os/-Oz",
4801         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4802     return true;
4803   }
4804 
4805   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4806   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4807     reportVectorizationFailure("Runtime stride check for small trip count",
4808         "runtime stride == 1 checks needed. Enable vectorization of "
4809         "this loop without such check by compiling with -Os/-Oz",
4810         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4811     return true;
4812   }
4813 
4814   return false;
4815 }
4816 
4817 ElementCount
4818 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4819   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4820     return ElementCount::getScalable(0);
4821 
4822   if (Hints->isScalableVectorizationDisabled()) {
4823     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4824                             "ScalableVectorizationDisabled", ORE, TheLoop);
4825     return ElementCount::getScalable(0);
4826   }
4827 
4828   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4829 
4830   auto MaxScalableVF = ElementCount::getScalable(
4831       std::numeric_limits<ElementCount::ScalarTy>::max());
4832 
4833   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4834   // FIXME: While for scalable vectors this is currently sufficient, this should
4835   // be replaced by a more detailed mechanism that filters out specific VFs,
4836   // instead of invalidating vectorization for a whole set of VFs based on the
4837   // MaxVF.
4838 
4839   // Disable scalable vectorization if the loop contains unsupported reductions.
4840   if (!canVectorizeReductions(MaxScalableVF)) {
4841     reportVectorizationInfo(
4842         "Scalable vectorization not supported for the reduction "
4843         "operations found in this loop.",
4844         "ScalableVFUnfeasible", ORE, TheLoop);
4845     return ElementCount::getScalable(0);
4846   }
4847 
4848   // Disable scalable vectorization if the loop contains any instructions
4849   // with element types not supported for scalable vectors.
4850   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4851         return !Ty->isVoidTy() &&
4852                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4853       })) {
4854     reportVectorizationInfo("Scalable vectorization is not supported "
4855                             "for all element types found in this loop.",
4856                             "ScalableVFUnfeasible", ORE, TheLoop);
4857     return ElementCount::getScalable(0);
4858   }
4859 
4860   if (Legal->isSafeForAnyVectorWidth())
4861     return MaxScalableVF;
4862 
4863   // Limit MaxScalableVF by the maximum safe dependence distance.
4864   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
4865   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4866     MaxVScale =
4867         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4868   MaxScalableVF = ElementCount::getScalable(
4869       MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0);
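  // For example (illustrative numbers only): MaxSafeElements = 32 with a
  // maximum vscale of 16 yields a maximum legal scalable VF of vscale x 2.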
4870   if (!MaxScalableVF)
4871     reportVectorizationInfo(
4872         "Max legal vector width too small, scalable vectorization "
4873         "unfeasible.",
4874         "ScalableVFUnfeasible", ORE, TheLoop);
4875 
4876   return MaxScalableVF;
4877 }
4878 
4879 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4880     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4881   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4882   unsigned SmallestType, WidestType;
4883   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4884 
  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
4889   unsigned MaxSafeElements =
4890       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
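  // For example (illustrative numbers only): a max safe vector width of 384
  // bits and a widest type of 32 bits give PowerOf2Floor(384 / 32) =
  // PowerOf2Floor(12) = 8 safe elements.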
4891 
4892   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4893   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4894 
4895   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4896                     << ".\n");
4897   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4898                     << ".\n");
4899 
4900   // First analyze the UserVF, fall back if the UserVF should be ignored.
4901   if (UserVF) {
4902     auto MaxSafeUserVF =
4903         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4904 
4905     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4906       // If `VF=vscale x N` is safe, then so is `VF=N`
4907       if (UserVF.isScalable())
4908         return FixedScalableVFPair(
4909             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4910       else
4911         return UserVF;
4912     }
4913 
4914     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4915 
4916     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4917     // is better to ignore the hint and let the compiler choose a suitable VF.
4918     if (!UserVF.isScalable()) {
4919       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4920                         << " is unsafe, clamping to max safe VF="
4921                         << MaxSafeFixedVF << ".\n");
4922       ORE->emit([&]() {
4923         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4924                                           TheLoop->getStartLoc(),
4925                                           TheLoop->getHeader())
4926                << "User-specified vectorization factor "
4927                << ore::NV("UserVectorizationFactor", UserVF)
4928                << " is unsafe, clamping to maximum safe vectorization factor "
4929                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4930       });
4931       return MaxSafeFixedVF;
4932     }
4933 
4934     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4935       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4936                         << " is ignored because scalable vectors are not "
4937                            "available.\n");
4938       ORE->emit([&]() {
4939         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4940                                           TheLoop->getStartLoc(),
4941                                           TheLoop->getHeader())
4942                << "User-specified vectorization factor "
4943                << ore::NV("UserVectorizationFactor", UserVF)
4944                << " is ignored because the target does not support scalable "
4945                   "vectors. The compiler will pick a more suitable value.";
4946       });
4947     } else {
4948       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4949                         << " is unsafe. Ignoring scalable UserVF.\n");
4950       ORE->emit([&]() {
4951         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4952                                           TheLoop->getStartLoc(),
4953                                           TheLoop->getHeader())
4954                << "User-specified vectorization factor "
4955                << ore::NV("UserVectorizationFactor", UserVF)
4956                << " is unsafe. Ignoring the hint to let the compiler pick a "
4957                   "more suitable value.";
4958       });
4959     }
4960   }
4961 
4962   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4963                     << " / " << WidestType << " bits.\n");
4964 
4965   FixedScalableVFPair Result(ElementCount::getFixed(1),
4966                              ElementCount::getScalable(0));
4967   if (auto MaxVF =
4968           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
4969                                   MaxSafeFixedVF, FoldTailByMasking))
4970     Result.FixedVF = MaxVF;
4971 
4972   if (auto MaxVF =
4973           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
4974                                   MaxSafeScalableVF, FoldTailByMasking))
4975     if (MaxVF.isScalable()) {
4976       Result.ScalableVF = MaxVF;
4977       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4978                         << "\n");
4979     }
4980 
4981   return Result;
4982 }
4983 
4984 FixedScalableVFPair
4985 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4986   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to be
    // dynamically uniform if the target can skip it.
4989     reportVectorizationFailure(
4990         "Not inserting runtime ptr check for divergent target",
4991         "runtime pointer checks needed. Not enabled for divergent target",
4992         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4993     return FixedScalableVFPair::getNone();
4994   }
4995 
4996   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4997   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4998   if (TC == 1) {
4999     reportVectorizationFailure("Single iteration (non) loop",
5000         "loop trip count is one, irrelevant for vectorization",
5001         "SingleIterationLoop", ORE, TheLoop);
5002     return FixedScalableVFPair::getNone();
5003   }
5004 
5005   switch (ScalarEpilogueStatus) {
5006   case CM_ScalarEpilogueAllowed:
5007     return computeFeasibleMaxVF(TC, UserVF, false);
5008   case CM_ScalarEpilogueNotAllowedUsePredicate:
5009     LLVM_FALLTHROUGH;
5010   case CM_ScalarEpilogueNotNeededUsePredicate:
5011     LLVM_DEBUG(
5012         dbgs() << "LV: vector predicate hint/switch found.\n"
5013                << "LV: Not allowing scalar epilogue, creating predicated "
5014                << "vector loop.\n");
5015     break;
5016   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5017     // fallthrough as a special case of OptForSize
5018   case CM_ScalarEpilogueNotAllowedOptSize:
5019     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5020       LLVM_DEBUG(
5021           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5022     else
5023       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5024                         << "count.\n");
5025 
5026     // Bail if runtime checks are required, which are not good when optimising
5027     // for size.
5028     if (runtimeChecksRequired())
5029       return FixedScalableVFPair::getNone();
5030 
5031     break;
5032   }
5033 
  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. Otherwise, we'd have to handle
  // the fact that not every instruction executes on the last iteration. This
  // will require a lane mask which varies through the vector loop body. (TODO)
5038   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5039     // If there was a tail-folding hint/switch, but we can't fold the tail by
5040     // masking, fallback to a vectorization with a scalar epilogue.
5041     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5042       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5043                            "scalar epilogue instead.\n");
5044       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5045       return computeFeasibleMaxVF(TC, UserVF, false);
5046     }
5047     return FixedScalableVFPair::getNone();
5048   }
5049 
  // Now try to fold the tail by masking.
5051 
5052   // Invalidate interleave groups that require an epilogue if we can't mask
5053   // the interleave-group.
5054   if (!useMaskedInterleavedAccesses(TTI)) {
5055     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5056            "No decisions should have been taken at this point");
5057     // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5059     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5060   }
5061 
5062   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5063   // Avoid tail folding if the trip count is known to be a multiple of any VF
5064   // we chose.
5065   // FIXME: The condition below pessimises the case for fixed-width vectors,
5066   // when scalable VFs are also candidates for vectorization.
5067   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5068     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5069     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5070            "MaxFixedVF must be a power of 2");
5071     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5072                                    : MaxFixedVF.getFixedValue();
5073     ScalarEvolution *SE = PSE.getSE();
5074     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5075     const SCEV *ExitCount = SE->getAddExpr(
5076         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5077     const SCEV *Rem = SE->getURemExpr(
5078         SE->applyLoopGuards(ExitCount, TheLoop),
5079         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5080     if (Rem->isZero()) {
5081       // Accept MaxFixedVF if we do not have a tail.
5082       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5083       return MaxFactors;
5084     }
5085   }
5086 
5087   // If we don't know the precise trip count, or if the trip count that we
5088   // found modulo the vectorization factor is not zero, try to fold the tail
5089   // by masking.
5090   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5091   if (Legal->prepareToFoldTailByMasking()) {
5092     FoldTailByMasking = true;
5093     return MaxFactors;
5094   }
5095 
5096   // If there was a tail-folding hint/switch, but we can't fold the tail by
5097   // masking, fallback to a vectorization with a scalar epilogue.
5098   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5099     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5100                          "scalar epilogue instead.\n");
5101     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5102     return MaxFactors;
5103   }
5104 
5105   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5106     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5107     return FixedScalableVFPair::getNone();
5108   }
5109 
5110   if (TC == 0) {
5111     reportVectorizationFailure(
5112         "Unable to calculate the loop count due to complex control flow",
5113         "unable to calculate the loop count due to complex control flow",
5114         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5115     return FixedScalableVFPair::getNone();
5116   }
5117 
5118   reportVectorizationFailure(
5119       "Cannot optimize for size and vectorize at the same time.",
5120       "cannot optimize for size and vectorize at the same time. "
5121       "Enable vectorization of this loop with '#pragma clang loop "
5122       "vectorize(enable)' when compiling with -Os/-Oz",
5123       "NoTailLoopWithOptForSize", ORE, TheLoop);
5124   return FixedScalableVFPair::getNone();
5125 }
5126 
5127 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5128     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5129     ElementCount MaxSafeVF, bool FoldTailByMasking) {
5130   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5131   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5132       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5133                            : TargetTransformInfo::RGK_FixedWidthVector);
5134 
5135   // Convenience function to return the minimum of two ElementCounts.
5136   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5137     assert((LHS.isScalable() == RHS.isScalable()) &&
5138            "Scalable flags must match");
5139     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5140   };
5141 
5142   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5144   auto MaxVectorElementCount = ElementCount::get(
5145       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5146       ComputeScalableMaxVF);
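  // For example (illustrative numbers only): a 128-bit widest register and a
  // widest type of 32 bits give 4 elements (4 x vscale in the scalable case).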
5147   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5148   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5149                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5150 
5151   if (!MaxVectorElementCount) {
5152     LLVM_DEBUG(dbgs() << "LV: The target has no "
5153                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5154                       << " vector registers.\n");
5155     return ElementCount::getFixed(1);
5156   }
5157 
5158   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5159   if (ConstTripCount &&
5160       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5161       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5162     // If loop trip count (TC) is known at compile time there is no point in
5163     // choosing VF greater than TC (as done in the loop below). Select maximum
5164     // power of two which doesn't exceed TC.
5165     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5166     // when the TC is less than or equal to the known number of lanes.
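    // For example (illustrative numbers only): ConstTripCount = 5 with
    // MaxVectorElementCount = 8 clamps the VF to PowerOf2Floor(5) = 4.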
5167     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5168     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5169                          "exceeding the constant trip count: "
5170                       << ClampedConstTripCount << "\n");
5171     return ElementCount::getFixed(ClampedConstTripCount);
5172   }
5173 
5174   TargetTransformInfo::RegisterKind RegKind =
5175       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5176                            : TargetTransformInfo::RGK_FixedWidthVector;
5177   ElementCount MaxVF = MaxVectorElementCount;
5178   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5179                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5180     auto MaxVectorElementCountMaxBW = ElementCount::get(
5181         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5182         ComputeScalableMaxVF);
5183     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5184 
5185     // Collect all viable vectorization factors larger than the default MaxVF
5186     // (i.e. MaxVectorElementCount).
5187     SmallVector<ElementCount, 8> VFs;
5188     for (ElementCount VS = MaxVectorElementCount * 2;
5189          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5190       VFs.push_back(VS);
5191 
5192     // For each VF calculate its register usage.
5193     auto RUs = calculateRegisterUsage(VFs);
5194 
    // Select the largest VF which doesn't require more registers than are
    // available.
5197     for (int i = RUs.size() - 1; i >= 0; --i) {
5198       bool Selected = true;
5199       for (auto &pair : RUs[i].MaxLocalUsers) {
5200         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5201         if (pair.second > TargetNumRegisters)
5202           Selected = false;
5203       }
5204       if (Selected) {
5205         MaxVF = VFs[i];
5206         break;
5207       }
5208     }
5209     if (ElementCount MinVF =
5210             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5211       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5212         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5213                           << ") with target's minimum: " << MinVF << '\n');
5214         MaxVF = MinVF;
5215       }
5216     }
5217 
5218     // Invalidate any widening decisions we might have made, in case the loop
5219     // requires prediction (decided later), but we have already made some
5220     // load/store widening decisions.
5221     invalidateCostModelingDecisions();
5222   }
5223   return MaxVF;
5224 }
5225 
5226 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5227   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5228     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5229     auto Min = Attr.getVScaleRangeMin();
5230     auto Max = Attr.getVScaleRangeMax();
5231     if (Max && Min == Max)
5232       return Max;
5233   }
5234 
5235   return TTI.getVScaleForTuning();
5236 }
5237 
5238 bool LoopVectorizationCostModel::isMoreProfitable(
5239     const VectorizationFactor &A, const VectorizationFactor &B) const {
5240   InstructionCost CostA = A.Cost;
5241   InstructionCost CostB = B.Cost;
5242 
5243   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5244 
5245   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5246       MaxTripCount) {
5247     // If we are folding the tail and the trip count is a known (possibly small)
5248     // constant, the trip count will be rounded up to an integer number of
5249     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5250     // which we compare directly. When not folding the tail, the total cost will
5251     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5252     // approximated with the per-lane cost below instead of using the tripcount
5253     // as here.
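    // For example (illustrative numbers only): with MaxTripCount = 10,
    // CostA = 8 at VF = 4 and CostB = 6 at VF = 8, the compared costs are
    // 8 * ceil(10/4) = 24 and 6 * ceil(10/8) = 12, so B is more profitable.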
5254     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5255     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5256     return RTCostA < RTCostB;
5257   }
5258 
5259   // Improve estimate for the vector width if it is scalable.
5260   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5261   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5262   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5263     if (A.Width.isScalable())
5264       EstimatedWidthA *= VScale.value();
5265     if (B.Width.isScalable())
5266       EstimatedWidthB *= VScale.value();
5267   }
5268 
5269   // Assume vscale may be larger than 1 (or the value being tuned for),
5270   // so that scalable vectorization is slightly favorable over fixed-width
5271   // vectorization.
5272   if (A.Width.isScalable() && !B.Width.isScalable())
5273     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5274 
5275   // To avoid the need for FP division:
5276   //      (CostA / A.Width) < (CostB / B.Width)
5277   // <=>  (CostA * B.Width) < (CostB * A.Width)
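  // For example (illustrative numbers only): CostA = 8 at width 4 versus
  // CostB = 6 at width 2 compares 8 * 2 = 16 against 6 * 4 = 24, so A is
  // considered more profitable.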
5278   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5279 }
5280 
5281 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5282     const ElementCountSet &VFCandidates) {
5283   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5284   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5285   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5286   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5287          "Expected Scalar VF to be a candidate");
5288 
5289   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5290                                        ExpectedCost);
5291   VectorizationFactor ChosenFactor = ScalarCost;
5292 
5293   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5294   if (ForceVectorization && VFCandidates.size() > 1) {
5295     // Ignore scalar width, because the user explicitly wants vectorization.
5296     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5297     // evaluation.
5298     ChosenFactor.Cost = InstructionCost::getMax();
5299   }
5300 
5301   SmallVector<InstructionVFPair> InvalidCosts;
5302   for (const auto &i : VFCandidates) {
5303     // The cost for scalar VF=1 is already calculated, so ignore it.
5304     if (i.isScalar())
5305       continue;
5306 
5307     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5308     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5309 
5310 #ifndef NDEBUG
5311     unsigned AssumedMinimumVscale = 1;
5312     if (Optional<unsigned> VScale = getVScaleForTuning())
5313       AssumedMinimumVscale = *VScale;
5314     unsigned Width =
5315         Candidate.Width.isScalable()
5316             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5317             : Candidate.Width.getFixedValue();
5318     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5319                       << " costs: " << (Candidate.Cost / Width));
5320     if (i.isScalable())
5321       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5322                         << AssumedMinimumVscale << ")");
5323     LLVM_DEBUG(dbgs() << ".\n");
5324 #endif
5325 
5326     if (!C.second && !ForceVectorization) {
5327       LLVM_DEBUG(
5328           dbgs() << "LV: Not considering vector loop of width " << i
5329                  << " because it will not generate any vector instructions.\n");
5330       continue;
5331     }
5332 
    // If profitable, add it to the ProfitableVFs list.
5334     if (isMoreProfitable(Candidate, ScalarCost))
5335       ProfitableVFs.push_back(Candidate);
5336 
5337     if (isMoreProfitable(Candidate, ChosenFactor))
5338       ChosenFactor = Candidate;
5339   }
5340 
5341   // Emit a report of VFs with invalid costs in the loop.
5342   if (!InvalidCosts.empty()) {
5343     // Group the remarks per instruction, keeping the instruction order from
5344     // InvalidCosts.
5345     std::map<Instruction *, unsigned> Numbering;
5346     unsigned I = 0;
5347     for (auto &Pair : InvalidCosts)
5348       if (!Numbering.count(Pair.first))
5349         Numbering[Pair.first] = I++;
5350 
5351     // Sort the list, first on instruction(number) then on VF.
5352     llvm::sort(InvalidCosts,
5353                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5354                  if (Numbering[A.first] != Numbering[B.first])
5355                    return Numbering[A.first] < Numbering[B.first];
5356                  ElementCountComparator ECC;
5357                  return ECC(A.second, B.second);
5358                });
5359 
5360     // For a list of ordered instruction-vf pairs:
5361     //   [(load, vf1), (load, vf2), (store, vf1)]
5362     // Group the instructions together to emit separate remarks for:
5363     //   load  (vf1, vf2)
5364     //   store (vf1)
5365     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5366     auto Subset = ArrayRef<InstructionVFPair>();
5367     do {
5368       if (Subset.empty())
5369         Subset = Tail.take_front(1);
5370 
5371       Instruction *I = Subset.front().first;
5372 
5373       // If the next instruction is different, or if there are no other pairs,
5374       // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //   remark: invalid costs for 'load' at VF=(vf1, vf2)
5378       if (Subset == Tail || Tail[Subset.size()].first != I) {
5379         std::string OutString;
5380         raw_string_ostream OS(OutString);
5381         assert(!Subset.empty() && "Unexpected empty range");
5382         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5383         for (auto &Pair : Subset)
5384           OS << (Pair.second == Subset.front().second ? "" : ", ")
5385              << Pair.second;
5386         OS << "):";
5387         if (auto *CI = dyn_cast<CallInst>(I))
5388           OS << " call to " << CI->getCalledFunction()->getName();
5389         else
5390           OS << " " << I->getOpcodeName();
5391         OS.flush();
5392         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5393         Tail = Tail.drop_front(Subset.size());
5394         Subset = {};
5395       } else
5396         // Grow the subset by one element
5397         Subset = Tail.take_front(Subset.size() + 1);
5398     } while (!Tail.empty());
5399   }
5400 
5401   if (!EnableCondStoresVectorization && NumPredStores) {
5402     reportVectorizationFailure("There are conditional stores.",
5403         "store that is conditionally executed prevents vectorization",
5404         "ConditionalStore", ORE, TheLoop);
5405     ChosenFactor = ScalarCost;
5406   }
5407 
5408   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5409                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5410              << "LV: Vectorization seems to be not beneficial, "
5411              << "but was forced by a user.\n");
5412   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5413   return ChosenFactor;
5414 }
5415 
5416 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5417     const Loop &L, ElementCount VF) const {
5418   // Cross iteration phis such as reductions need special handling and are
5419   // currently unsupported.
5420   if (any_of(L.getHeader()->phis(),
5421              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5422     return false;
5423 
5424   // Phis with uses outside of the loop require special handling and are
5425   // currently unsupported.
5426   for (auto &Entry : Legal->getInductionVars()) {
5427     // Look for uses of the value of the induction at the last iteration.
5428     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5429     for (User *U : PostInc->users())
5430       if (!L.contains(cast<Instruction>(U)))
5431         return false;
5432     // Look for uses of penultimate value of the induction.
5433     for (User *U : Entry.first->users())
5434       if (!L.contains(cast<Instruction>(U)))
5435         return false;
5436   }
5437 
5438   // Induction variables that are widened require special handling that is
5439   // currently not supported.
5440   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5441         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5442                  this->isProfitableToScalarize(Entry.first, VF));
5443       }))
5444     return false;
5445 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5449   if (L.getExitingBlock() != L.getLoopLatch())
5450     return false;
5451 
5452   return true;
5453 }
5454 
5455 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5456     const ElementCount VF) const {
5457   // FIXME: We need a much better cost-model to take different parameters such
5458   // as register pressure, code size increase and cost of extra branches into
5459   // account. For now we apply a very crude heuristic and only consider loops
5460   // with vectorization factors larger than a certain value.
5461   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5463   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5464     return false;
5465   // FIXME: We should consider changing the threshold for scalable
5466   // vectors to take VScaleForTuning into account.
5467   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5468     return true;
5469   return false;
5470 }
5471 
5472 VectorizationFactor
5473 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5474     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5475   VectorizationFactor Result = VectorizationFactor::Disabled();
5476   if (!EnableEpilogueVectorization) {
5477     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5478     return Result;
5479   }
5480 
5481   if (!isScalarEpilogueAllowed()) {
5482     LLVM_DEBUG(
5483         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5484                   "allowed.\n";);
5485     return Result;
5486   }
5487 
5488   // Not really a cost consideration, but check for unsupported cases here to
5489   // simplify the logic.
5490   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5491     LLVM_DEBUG(
5492         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5493                   "not a supported candidate.\n";);
5494     return Result;
5495   }
5496 
5497   if (EpilogueVectorizationForceVF > 1) {
5498     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5499     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5500     if (LVP.hasPlanWithVF(ForcedEC))
5501       return {ForcedEC, 0, 0};
5502     else {
5503       LLVM_DEBUG(
5504           dbgs()
5505               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5506       return Result;
5507     }
5508   }
5509 
5510   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5511       TheLoop->getHeader()->getParent()->hasMinSize()) {
5512     LLVM_DEBUG(
5513         dbgs()
5514             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5515     return Result;
5516   }
5517 
5518   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5519     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5520                          "this loop\n");
5521     return Result;
5522   }
5523 
5524   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5525   // the main loop handles 8 lanes per iteration. We could still benefit from
5526   // vectorizing the epilogue loop with VF=4.
5527   ElementCount EstimatedRuntimeVF = MainLoopVF;
5528   if (MainLoopVF.isScalable()) {
5529     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5530     if (Optional<unsigned> VScale = getVScaleForTuning())
5531       EstimatedRuntimeVF *= *VScale;
5532   }
5533 
5534   for (auto &NextVF : ProfitableVFs)
5535     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5536           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5537          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5538         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5539         LVP.hasPlanWithVF(NextVF.Width))
5540       Result = NextVF;
5541 
5542   if (Result != VectorizationFactor::Disabled())
5543     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5544                       << Result.Width << "\n";);
5545   return Result;
5546 }
5547 
5548 std::pair<unsigned, unsigned>
5549 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5550   unsigned MinWidth = -1U;
5551   unsigned MaxWidth = 8;
5552   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5553   // For in-loop reductions, no element types are added to ElementTypesInLoop
5554   // if there are no loads/stores in the loop. In this case, check through the
5555   // reduction variables to determine the maximum width.
5556   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5557     // Reset MaxWidth so that we can find the smallest type used by recurrences
5558     // in the loop.
5559     MaxWidth = -1U;
5560     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5561       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5562       // When finding the min width used by the recurrence we need to account
5563       // for casts on the input operands of the recurrence.
5564       MaxWidth = std::min<unsigned>(
5565           MaxWidth, std::min<unsigned>(
5566                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5567                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5568     }
5569   } else {
5570     for (Type *T : ElementTypesInLoop) {
5571       MinWidth = std::min<unsigned>(
5572           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5573       MaxWidth = std::max<unsigned>(
5574           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5575     }
5576   }
5577   return {MinWidth, MaxWidth};
5578 }
5579 
5580 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5581   ElementTypesInLoop.clear();
5582   // For each block.
5583   for (BasicBlock *BB : TheLoop->blocks()) {
5584     // For each instruction in the loop.
5585     for (Instruction &I : BB->instructionsWithoutDebug()) {
5586       Type *T = I.getType();
5587 
5588       // Skip ignored values.
5589       if (ValuesToIgnore.count(&I))
5590         continue;
5591 
5592       // Only examine Loads, Stores and PHINodes.
5593       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5594         continue;
5595 
5596       // Examine PHI nodes that are reduction variables. Update the type to
5597       // account for the recurrence type.
5598       if (auto *PN = dyn_cast<PHINode>(&I)) {
5599         if (!Legal->isReductionVariable(PN))
5600           continue;
5601         const RecurrenceDescriptor &RdxDesc =
5602             Legal->getReductionVars().find(PN)->second;
5603         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5604             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5605                                       RdxDesc.getRecurrenceType(),
5606                                       TargetTransformInfo::ReductionFlags()))
5607           continue;
5608         T = RdxDesc.getRecurrenceType();
5609       }
5610 
5611       // Examine the stored values.
5612       if (auto *ST = dyn_cast<StoreInst>(&I))
5613         T = ST->getValueOperand()->getType();
5614 
5615       assert(T->isSized() &&
5616              "Expected the load/store/recurrence type to be sized");
5617 
5618       ElementTypesInLoop.insert(T);
5619     }
5620   }
5621 }
5622 
5623 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5624                                                            unsigned LoopCost) {
5625   // -- The interleave heuristics --
5626   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5627   // There are many micro-architectural considerations that we can't predict
5628   // at this level. For example, frontend pressure (on decode or fetch) due to
5629   // code size, or the number and capabilities of the execution ports.
5630   //
5631   // We use the following heuristics to select the interleave count:
5632   // 1. If the code has reductions, then we interleave to break the cross
5633   // iteration dependency.
5634   // 2. If the loop is really small, then we interleave to reduce the loop
5635   // overhead.
5636   // 3. We don't interleave if we think that we will spill registers to memory
5637   // due to the increased register pressure.
5638 
5639   if (!isScalarEpilogueAllowed())
5640     return 1;
5641 
  // The maximum safe dependence distance has already been used to limit the
  // VF; interleaving would effectively widen the loop beyond that, so do not
  // interleave.
5643   if (Legal->getMaxSafeDepDistBytes() != -1U)
5644     return 1;
5645 
5646   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5647   const bool HasReductions = !Legal->getReductionVars().empty();
5648   // Do not interleave loops with a relatively small known or estimated trip
5649   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5651   // because with the above conditions interleaving can expose ILP and break
5652   // cross iteration dependences for reductions.
5653   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5654       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5655     return 1;
5656 
5657   // If we did not calculate the cost for VF (because the user selected the VF)
5658   // then we calculate the cost of VF here.
5659   if (LoopCost == 0) {
5660     InstructionCost C = expectedCost(VF).first;
5661     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
5662     LoopCost = *C.getValue();
5663 
5664     // Loop body is free and there is no need for interleaving.
5665     if (LoopCost == 0)
5666       return 1;
5667   }
5668 
5669   RegisterUsage R = calculateRegisterUsage({VF})[0];
5670   // We divide by these constants so assume that we have at least one
5671   // instruction that uses at least one register.
5672   for (auto& pair : R.MaxLocalUsers) {
5673     pair.second = std::max(pair.second, 1U);
5674   }
5675 
5676   // We calculate the interleave count using the following formula.
5677   // Subtract the number of loop invariants from the number of available
5678   // registers. These registers are used by all of the interleaved instances.
5679   // Next, divide the remaining registers by the number of registers that is
5680   // required by the loop, in order to estimate how many parallel instances
5681   // fit without causing spills. All of this is rounded down if necessary to be
5682   // a power of two. We want power of two interleave count to simplify any
5683   // addressing operations or alignment considerations.
5684   // We also want power of two interleave counts to ensure that the induction
5685   // variable of the vector loop wraps to zero, when tail is folded by masking;
5686   // this currently happens when OptForSize, in which case IC is set to 1 above.
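  // For example (illustrative numbers only, ignoring the induction-variable
  // heuristic below): with 32 registers in a class, 2 of them holding loop
  // invariants and a max local usage of 6, the candidate interleave count is
  // PowerOf2Floor((32 - 2) / 6) = 4.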
5687   unsigned IC = UINT_MAX;
5688 
5689   for (auto& pair : R.MaxLocalUsers) {
5690     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5691     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5692                       << " registers of "
5693                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5694     if (VF.isScalar()) {
5695       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5696         TargetNumRegisters = ForceTargetNumScalarRegs;
5697     } else {
5698       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5699         TargetNumRegisters = ForceTargetNumVectorRegs;
5700     }
5701     unsigned MaxLocalUsers = pair.second;
5702     unsigned LoopInvariantRegs = 0;
5703     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5704       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5705 
5706     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5707     // Don't count the induction variable as interleaved.
5708     if (EnableIndVarRegisterHeur) {
5709       TmpIC =
5710           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5711                         std::max(1U, (MaxLocalUsers - 1)));
5712     }
5713 
5714     IC = std::min(IC, TmpIC);
5715   }
5716 
5717   // Clamp the interleave ranges to reasonable counts.
5718   unsigned MaxInterleaveCount =
5719       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5720 
5721   // Check if the user has overridden the max.
5722   if (VF.isScalar()) {
5723     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5724       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5725   } else {
5726     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5727       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5728   }
5729 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, making sure it
  // is at least 1.
5733   //
5734   // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iteration are enabled. However, for larger loops, there is likely to be a
5737   // similar benefit as for fixed-width vectors. For now, we choose to leave
5738   // the InterleaveCount as if vscale is '1', although if some information about
5739   // the vector is known (e.g. min vector size), we can make a better decision.
5740   if (BestKnownTC) {
5741     MaxInterleaveCount =
5742         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5743     // Make sure MaxInterleaveCount is greater than 0.
5744     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5745   }
5746 
5747   assert(MaxInterleaveCount > 0 &&
5748          "Maximum interleave count must be greater than 0");
5749 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5752   if (IC > MaxInterleaveCount)
5753     IC = MaxInterleaveCount;
5754   else
5755     // Make sure IC is greater than 0.
5756     IC = std::max(1u, IC);
5757 
5758   assert(IC > 0 && "Interleave count must be greater than 0.");
5759 
5760   // Interleave if we vectorized this loop and there is a reduction that could
5761   // benefit from interleaving.
5762   if (VF.isVector() && HasReductions) {
5763     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5764     return IC;
5765   }
5766 
5767   // For any scalar loop that either requires runtime checks or predication we
5768   // are better off leaving this to the unroller. Note that if we've already
5769   // vectorized the loop we will have done the runtime check and so interleaving
5770   // won't require further checks.
5771   bool ScalarInterleavingRequiresPredication =
5772       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5773          return Legal->blockNeedsPredication(BB);
5774        }));
5775   bool ScalarInterleavingRequiresRuntimePointerCheck =
5776       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5777 
5778   // We want to interleave small loops in order to reduce the loop overhead and
5779   // potentially expose ILP opportunities.
5780   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5781                     << "LV: IC is " << IC << '\n'
5782                     << "LV: VF is " << VF << '\n');
5783   const bool AggressivelyInterleaveReductions =
5784       TTI.enableAggressiveInterleaving(HasReductions);
5785   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5786       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5787     // We assume that the cost overhead is 1 and we use the cost model
5788     // to estimate the cost of the loop and interleave until the cost of the
5789     // loop overhead is about 5% of the cost of the loop.
5790     unsigned SmallIC =
5791         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
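    // For example (illustrative numbers only): if SmallLoopCost is 20 and the
    // loop body costs 5, SmallIC is capped at PowerOf2Floor(20 / 5) = 4.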
5792 
5793     // Interleave until store/load ports (estimated by max interleave count) are
5794     // saturated.
5795     unsigned NumStores = Legal->getNumStores();
5796     unsigned NumLoads = Legal->getNumLoads();
5797     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5798     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5799 
5800     // There is little point in interleaving for reductions containing selects
5801     // and compares when VF=1 since it may just create more overhead than it's
5802     // worth for loops with small trip counts. This is because we still have to
5803     // do the final reduction after the loop.
5804     bool HasSelectCmpReductions =
5805         HasReductions &&
5806         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5807           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5808           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5809               RdxDesc.getRecurrenceKind());
5810         });
5811     if (HasSelectCmpReductions) {
5812       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5813       return 1;
5814     }
5815 
5816     // If we have a scalar reduction (vector reductions are already dealt with
5817     // by this point), we can increase the critical path length if the loop
5818     // we're interleaving is inside another loop. For tree-wise reductions
5819     // set the limit to 2, and for ordered reductions it's best to disable
5820     // interleaving entirely.
5821     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5822       bool HasOrderedReductions =
5823           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5824             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5825             return RdxDesc.isOrdered();
5826           });
5827       if (HasOrderedReductions) {
5828         LLVM_DEBUG(
5829             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5830         return 1;
5831       }
5832 
5833       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5834       SmallIC = std::min(SmallIC, F);
5835       StoresIC = std::min(StoresIC, F);
5836       LoadsIC = std::min(LoadsIC, F);
5837     }
5838 
5839     if (EnableLoadStoreRuntimeInterleave &&
5840         std::max(StoresIC, LoadsIC) > SmallIC) {
5841       LLVM_DEBUG(
5842           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5843       return std::max(StoresIC, LoadsIC);
5844     }
5845 
5846     // If there are scalar reductions and TTI has enabled aggressive
5847     // interleaving for reductions, we will interleave to expose ILP.
5848     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5849         AggressivelyInterleaveReductions) {
5850       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5851       // Interleave no less than SmallIC but not as aggressive as the normal IC
5852       // to satisfy the rare situation when resources are too limited.
5853       return std::max(IC / 2, SmallIC);
5854     } else {
5855       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5856       return SmallIC;
5857     }
5858   }
5859 
5860   // Interleave if this is a large loop (small loops are already dealt with by
5861   // this point) that could benefit from interleaving.
5862   if (AggressivelyInterleaveReductions) {
5863     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5864     return IC;
5865   }
5866 
5867   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5868   return 1;
5869 }
5870 
5871 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5872 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5873   // This function calculates the register usage by measuring the highest number
5874   // of values that are alive at a single location. Obviously, this is a very
  // rough estimate. We scan the loop in topological order and assign a number
  // to each instruction. We use RPO to ensure that defs are
5877   // met before their users. We assume that each instruction that has in-loop
5878   // users starts an interval. We record every time that an in-loop value is
5879   // used, so we have a list of the first and last occurrences of each
5880   // instruction. Next, we transpose this data structure into a multi map that
5881   // holds the list of intervals that *end* at a specific location. This multi
5882   // map allows us to perform a linear search. We scan the instructions linearly
5883   // and record each time that a new interval starts, by placing it in a set.
5884   // If we find this value in the multi-map then we remove it from the set.
5885   // The max register usage is the maximum size of the set.
5886   // We also search for instructions that are defined outside the loop, but are
5887   // used inside the loop. We need this number separately from the max-interval
5888   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
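  // A rough sketch of the interval idea (illustrative only):
  //   i0: %a = ...
  //   i1: %b = ...
  //   i2: %c = op %a, %b
  // When visiting i2, the intervals of %a and %b are both still open, so two
  // values are considered live there; %c opens a new interval afterwards.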
5890   LoopBlocksDFS DFS(TheLoop);
5891   DFS.perform(LI);
5892 
5893   RegisterUsage RU;
5894 
5895   // Each 'key' in the map opens a new interval. The values
5896   // of the map are the index of the 'last seen' usage of the
5897   // instruction that is the key.
5898   using IntervalMap = DenseMap<Instruction *, unsigned>;
5899 
5900   // Maps instruction to its index.
5901   SmallVector<Instruction *, 64> IdxToInstr;
5902   // Marks the end of each interval.
5903   IntervalMap EndPoint;
5904   // Saves the list of instruction indices that are used in the loop.
5905   SmallPtrSet<Instruction *, 8> Ends;
5906   // Saves the list of values that are used in the loop but are
5907   // defined outside the loop, such as arguments and constants.
5908   SmallPtrSet<Value *, 8> LoopInvariants;
5909 
5910   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5911     for (Instruction &I : BB->instructionsWithoutDebug()) {
5912       IdxToInstr.push_back(&I);
5913 
5914       // Save the end location of each USE.
5915       for (Value *U : I.operands()) {
5916         auto *Instr = dyn_cast<Instruction>(U);
5917 
5918         // Ignore non-instruction values such as arguments, constants, etc.
5919         if (!Instr)
5920           continue;
5921 
5922         // If this instruction is outside the loop then record it and continue.
5923         if (!TheLoop->contains(Instr)) {
5924           LoopInvariants.insert(Instr);
5925           continue;
5926         }
5927 
5928         // Overwrite previous end points.
5929         EndPoint[Instr] = IdxToInstr.size();
5930         Ends.insert(Instr);
5931       }
5932     }
5933   }
5934 
5935   // Saves the list of intervals that end with the index in 'key'.
5936   using InstrList = SmallVector<Instruction *, 2>;
5937   DenseMap<unsigned, InstrList> TransposeEnds;
5938 
5939   // Transpose the EndPoints to a list of values that end at each index.
5940   for (auto &Interval : EndPoint)
5941     TransposeEnds[Interval.second].push_back(Interval.first);
5942 
5943   SmallPtrSet<Instruction *, 8> OpenIntervals;
5944   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5945   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5946 
5947   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5948 
5949   const auto &TTICapture = TTI;
5950   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5951     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5952       return 0;
5953     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5954   };
5955 
5956   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5957     Instruction *I = IdxToInstr[i];
5958 
5959     // Remove all of the instructions that end at this location.
5960     InstrList &List = TransposeEnds[i];
5961     for (Instruction *ToRemove : List)
5962       OpenIntervals.erase(ToRemove);
5963 
5964     // Ignore instructions that are never used within the loop.
5965     if (!Ends.count(I))
5966       continue;
5967 
5968     // Skip ignored values.
5969     if (ValuesToIgnore.count(I))
5970       continue;
5971 
5972     // For each VF find the maximum usage of registers.
5973     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5974       // Count the number of live intervals.
5975       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5976 
5977       if (VFs[j].isScalar()) {
5978         for (auto Inst : OpenIntervals) {
5979           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5980           if (RegUsage.find(ClassID) == RegUsage.end())
5981             RegUsage[ClassID] = 1;
5982           else
5983             RegUsage[ClassID] += 1;
5984         }
5985       } else {
5986         collectUniformsAndScalars(VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
6005       }
6006 
      for (const auto &Pair : RegUsage) {
        auto &Entry = MaxUsages[j][Pair.first];
        Entry = std::max(Entry, Pair.second);
      }
6013     }
6014 
6015     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6016                       << OpenIntervals.size() << '\n');
6017 
6018     // Add the current instruction to the list of open intervals.
6019     OpenIntervals.insert(I);
6020   }
6021 
6022   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6023     SmallMapVector<unsigned, unsigned, 4> Invariant;
6024 
    for (auto *Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
    }
6035 
6036     LLVM_DEBUG({
6037       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6038       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6039              << " item\n";
6040       for (const auto &pair : MaxUsages[i]) {
6041         dbgs() << "LV(REG): RegisterClass: "
6042                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6043                << " registers\n";
6044       }
6045       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6046              << " item\n";
6047       for (const auto &pair : Invariant) {
6048         dbgs() << "LV(REG): RegisterClass: "
6049                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6050                << " registers\n";
6051       }
6052     });
6053 
6054     RU.LoopInvariantRegs = Invariant;
6055     RU.MaxLocalUsers = MaxUsages[i];
6056     RUs[i] = RU;
6057   }
6058 
6059   return RUs;
6060 }
6061 
6062 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6063                                                            ElementCount VF) {
  // TODO: The cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost
  // model. Masked Load/Gather emulation was previously never allowed, while
  // emulation of a limited number of Masked Stores/Scatters was allowed.
6072   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6073   return isa<LoadInst>(I) ||
6074          (isa<StoreInst>(I) &&
6075           NumPredStores > NumberOfStoresToPredicate);
6076 }
6077 
6078 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6079   // If we aren't vectorizing the loop, or if we've already collected the
6080   // instructions to scalarize, there's nothing to do. Collection may already
6081   // have occurred if we have a user-selected VF and are now computing the
6082   // expected cost for interleaving.
6083   if (VF.isScalar() || VF.isZero() ||
6084       InstsToScalarize.find(VF) != InstsToScalarize.end())
6085     return;
6086 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6088   // not profitable to scalarize any instructions, the presence of VF in the
6089   // map will indicate that we've analyzed it already.
6090   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6091 
6092   PredicatedBBsAfterVectorization[VF].clear();
6093 
  // Find all the instructions that are scalar with predication in the loop and
  // determine whether it would be better not to if-convert the blocks they are
  // in. If so, we also record the instructions to scalarize.
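  // Illustrative, hypothetical source loop:
  //   for (i) if (cond[i]) out[i] = a[i] / b[i];   // integer division
  // The division is scalar-with-predication; computePredInstDiscount decides
  // whether scalarizing it (and the single-use chain feeding it) beats the
  // if-converted vector form, and PredicatedBBsAfterVectorization remembers
  // that its block remains after vectorization.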
6097   for (BasicBlock *BB : TheLoop->blocks()) {
6098     if (!blockNeedsPredicationForAnyReason(BB))
6099       continue;
6100     for (Instruction &I : *BB)
6101       if (isScalarWithPredication(&I, VF)) {
6102         ScalarCostsTy ScalarCosts;
6103         // Do not apply discount if scalable, because that would lead to
6104         // invalid scalarization costs.
6105         // Do not apply discount logic if hacked cost is needed
6106         // for emulated masked memrefs.
6107         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6108             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6109           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6110         // Remember that BB will remain after vectorization.
6111         PredicatedBBsAfterVectorization[VF].insert(BB);
6112       }
6113   }
6114 }
6115 
6116 int LoopVectorizationCostModel::computePredInstDiscount(
6117     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6118   assert(!isUniformAfterVectorization(PredInst, VF) &&
6119          "Instruction marked uniform-after-vectorization will be predicated");
6120 
6121   // Initialize the discount to zero, meaning that the scalar version and the
6122   // vector version cost the same.
6123   InstructionCost Discount = 0;
6124 
6125   // Holds instructions to analyze. The instructions we visit are mapped in
6126   // ScalarCosts. Those instructions are the ones that would be scalarized if
6127   // we find that the scalar version costs less.
6128   SmallVector<Instruction *, 8> Worklist;
6129 
6130   // Returns true if the given instruction can be scalarized.
6131   auto canBeScalarized = [&](Instruction *I) -> bool {
6132     // We only attempt to scalarize instructions forming a single-use chain
6133     // from the original predicated block that would otherwise be vectorized.
6134     // Although not strictly necessary, we give up on instructions we know will
6135     // already be scalar to avoid traversing chains that are unlikely to be
6136     // beneficial.
6137     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6138         isScalarAfterVectorization(I, VF))
6139       return false;
6140 
6141     // If the instruction is scalar with predication, it will be analyzed
6142     // separately. We ignore it within the context of PredInst.
6143     if (isScalarWithPredication(I, VF))
6144       return false;
6145 
6146     // If any of the instruction's operands are uniform after vectorization,
6147     // the instruction cannot be scalarized. This prevents, for example, a
6148     // masked load from being scalarized.
6149     //
6150     // We assume we will only emit a value for lane zero of an instruction
6151     // marked uniform after vectorization, rather than VF identical values.
6152     // Thus, if we scalarize an instruction that uses a uniform, we would
6153     // create uses of values corresponding to the lanes we aren't emitting code
6154     // for. This behavior can be changed by allowing getScalarValue to clone
6155     // the lane zero values for uniforms rather than asserting.
6156     for (Use &U : I->operands())
6157       if (auto *J = dyn_cast<Instruction>(U.get()))
6158         if (isUniformAfterVectorization(J, VF))
6159           return false;
6160 
6161     // Otherwise, we can scalarize the instruction.
6162     return true;
6163   };
6164 
6165   // Compute the expected cost discount from scalarizing the entire expression
6166   // feeding the predicated instruction. We currently only consider expressions
6167   // that are single-use instruction chains.
6168   Worklist.push_back(PredInst);
6169   while (!Worklist.empty()) {
6170     Instruction *I = Worklist.pop_back_val();
6171 
6172     // If we've already analyzed the instruction, there's nothing to do.
6173     if (ScalarCosts.find(I) != ScalarCosts.end())
6174       continue;
6175 
6176     // Compute the cost of the vector instruction. Note that this cost already
6177     // includes the scalarization overhead of the predicated instruction.
6178     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6179 
6180     // Compute the cost of the scalarized instruction. This cost is the cost of
6181     // the instruction as if it wasn't if-converted and instead remained in the
6182     // predicated block. We will scale this cost by block probability after
6183     // computing the scalarization overhead.
6184     InstructionCost ScalarCost =
6185         VF.getFixedValue() *
6186         getInstructionCost(I, ElementCount::getFixed(1)).first;
6187 
6188     // Compute the scalarization overhead of needed insertelement instructions
6189     // and phi nodes.
6190     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6191       ScalarCost += TTI.getScalarizationOverhead(
6192           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6193           APInt::getAllOnes(VF.getFixedValue()), true, false);
6194       ScalarCost +=
6195           VF.getFixedValue() *
6196           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6197     }
6198 
6199     // Compute the scalarization overhead of needed extractelement
6200     // instructions. For each of the instruction's operands, if the operand can
6201     // be scalarized, add it to the worklist; otherwise, account for the
6202     // overhead.
6203     for (Use &U : I->operands())
6204       if (auto *J = dyn_cast<Instruction>(U.get())) {
6205         assert(VectorType::isValidElementType(J->getType()) &&
6206                "Instruction has non-scalar type");
6207         if (canBeScalarized(J))
6208           Worklist.push_back(J);
6209         else if (needsExtract(J, VF)) {
6210           ScalarCost += TTI.getScalarizationOverhead(
6211               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6212               APInt::getAllOnes(VF.getFixedValue()), false, true);
6213         }
6214       }
6215 
6216     // Scale the total scalar cost by block probability.
6217     ScalarCost /= getReciprocalPredBlockProb();
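    // Worked example with hypothetical costs: assuming
    // getReciprocalPredBlockProb() returns 2 (i.e. a 50% block probability),
    // a summed scalar cost of 8 becomes 4 here before it is compared with
    // the unconditional vector cost.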
6218 
    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs at least as much as the scalar version, so
    // scalarizing would not be worse.
6221     Discount += VectorCost - ScalarCost;
6222     ScalarCosts[I] = ScalarCost;
6223   }
6224 
6225   return *Discount.getValue();
6226 }
6227 
6228 LoopVectorizationCostModel::VectorizationCostTy
6229 LoopVectorizationCostModel::expectedCost(
6230     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6231   VectorizationCostTy Cost;
6232 
6233   // For each block.
6234   for (BasicBlock *BB : TheLoop->blocks()) {
6235     VectorizationCostTy BlockCost;
6236 
6237     // For each instruction in the old loop.
6238     for (Instruction &I : BB->instructionsWithoutDebug()) {
6239       // Skip ignored values.
6240       if (ValuesToIgnore.count(&I) ||
6241           (VF.isVector() && VecValuesToIgnore.count(&I)))
6242         continue;
6243 
6244       VectorizationCostTy C = getInstructionCost(&I, VF);
6245 
6246       // Check if we should override the cost.
6247       if (C.first.isValid() &&
6248           ForceTargetInstructionCost.getNumOccurrences() > 0)
6249         C.first = InstructionCost(ForceTargetInstructionCost);
6250 
6251       // Keep a list of instructions with invalid costs.
6252       if (Invalid && !C.first.isValid())
6253         Invalid->emplace_back(&I, VF);
6254 
6255       BlockCost.first += C.first;
6256       BlockCost.second |= C.second;
6257       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6258                         << " for VF " << VF << " For instruction: " << I
6259                         << '\n');
6260     }
6261 
6262     // If we are vectorizing a predicated block, it will have been
6263     // if-converted. This means that the block's instructions (aside from
6264     // stores and instructions that may divide by zero) will now be
6265     // unconditionally executed. For the scalar case, we may not always execute
6266     // the predicated block, if it is an if-else block. Thus, scale the block's
6267     // cost by the probability of executing it. blockNeedsPredication from
6268     // Legal is used so as to not include all blocks in tail folded loops.
6269     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6270       BlockCost.first /= getReciprocalPredBlockProb();
6271 
6272     Cost.first += BlockCost.first;
6273     Cost.second |= BlockCost.second;
6274   }
6275 
6276   return Cost;
6277 }
6278 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
6281 ///
6282 /// This SCEV can be sent to the Target in order to estimate the address
6283 /// calculation cost.
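///
/// Illustrative, hypothetical IR shape accepted by the check below:
///   %gep = getelementptr inbounds [256 x i32], [256 x i32]* %A,
///                                 i64 %inv, i64 %iv
/// where %inv is loop invariant and %iv is an induction variable.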
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
6289 
6290   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6291   if (!Gep)
6292     return nullptr;
6293 
6294   // We are looking for a gep with all loop invariant indices except for one
6295   // which should be an induction variable.
6296   auto SE = PSE.getSE();
6297   unsigned NumOperands = Gep->getNumOperands();
6298   for (unsigned i = 1; i < NumOperands; ++i) {
6299     Value *Opd = Gep->getOperand(i);
6300     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6301         !Legal->isInductionVariable(Opd))
6302       return nullptr;
6303   }
6304 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6306   return PSE.getSCEV(Ptr);
6307 }
6308 
6309 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6310   return Legal->hasStride(I->getOperand(0)) ||
6311          Legal->hasStride(I->getOperand(1));
6312 }
6313 
6314 InstructionCost
6315 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6316                                                         ElementCount VF) {
6317   assert(VF.isVector() &&
6318          "Scalarization cost of instruction implies vectorization.");
6319   if (VF.isScalable())
6320     return InstructionCost::getInvalid();
6321 
6322   Type *ValTy = getLoadStoreType(I);
6323   auto SE = PSE.getSE();
6324 
6325   unsigned AS = getLoadStoreAddressSpace(I);
6326   Value *Ptr = getLoadStorePointerOperand(I);
6327   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6328   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6329   //       that it is being called from this specific place.
6330 
6331   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6333   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6334 
6335   // Get the cost of the scalar memory instruction and address computation.
6336   InstructionCost Cost =
6337       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6338 
6339   // Don't pass *I here, since it is scalar but will actually be part of a
6340   // vectorized loop where the user of it is a vectorized instruction.
6341   const Align Alignment = getLoadStoreAlignment(I);
6342   Cost += VF.getKnownMinValue() *
6343           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6344                               AS, TTI::TCK_RecipThroughput);
6345 
6346   // Get the overhead of the extractelement and insertelement instructions
6347   // we might create due to scalarization.
6348   Cost += getScalarizationOverhead(I, VF);
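  // Illustrative breakdown with hypothetical costs and VF = 4: four address
  // computations plus four scalar loads/stores, plus the insert/extract
  // overhead added above; a predicated access is additionally scaled by the
  // block probability and pays for the i1 extracts and branches below.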
6349 
6350   // If we have a predicated load/store, it will need extra i1 extracts and
6351   // conditional branches, but may not be executed for each vector lane. Scale
6352   // the cost by the probability of executing the predicated block.
6353   if (isPredicatedInst(I, VF)) {
6354     Cost /= getReciprocalPredBlockProb();
6355 
6356     // Add the cost of an i1 extract and a branch
6357     auto *Vec_i1Ty =
6358         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6359     Cost += TTI.getScalarizationOverhead(
6360         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6361         /*Insert=*/false, /*Extract=*/true);
6362     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6363 
6364     if (useEmulatedMaskMemRefHack(I, VF))
6365       // Artificially setting to a high enough value to practically disable
6366       // vectorization with such operations.
6367       Cost = 3000000;
6368   }
6369 
6370   return Cost;
6371 }
6372 
6373 InstructionCost
6374 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6375                                                     ElementCount VF) {
6376   Type *ValTy = getLoadStoreType(I);
6377   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6378   Value *Ptr = getLoadStorePointerOperand(I);
6379   unsigned AS = getLoadStoreAddressSpace(I);
6380   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6381   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6382 
6383   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6384          "Stride should be 1 or -1 for consecutive memory access");
6385   const Align Alignment = getLoadStoreAlignment(I);
6386   InstructionCost Cost = 0;
6387   if (Legal->isMaskRequired(I))
6388     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6389                                       CostKind);
6390   else
6391     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6392                                 CostKind, I);
6393 
6394   bool Reverse = ConsecutiveStride < 0;
6395   if (Reverse)
6396     Cost +=
6397         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6398   return Cost;
6399 }
6400 
6401 InstructionCost
6402 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6403                                                 ElementCount VF) {
6404   assert(Legal->isUniformMemOp(*I));
6405 
6406   Type *ValTy = getLoadStoreType(I);
6407   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6408   const Align Alignment = getLoadStoreAlignment(I);
6409   unsigned AS = getLoadStoreAddressSpace(I);
6410   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6411   if (isa<LoadInst>(I)) {
6412     return TTI.getAddressComputationCost(ValTy) +
6413            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6414                                CostKind) +
6415            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6416   }
6417   StoreInst *SI = cast<StoreInst>(I);
6418 
6419   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6420   return TTI.getAddressComputationCost(ValTy) +
6421          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6422                              CostKind) +
6423          (isLoopInvariantStoreValue
6424               ? 0
6425               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6426                                        VF.getKnownMinValue() - 1));
6427 }
6428 
6429 InstructionCost
6430 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6431                                                  ElementCount VF) {
6432   Type *ValTy = getLoadStoreType(I);
6433   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6434   const Align Alignment = getLoadStoreAlignment(I);
6435   const Value *Ptr = getLoadStorePointerOperand(I);
6436 
6437   return TTI.getAddressComputationCost(VectorTy) +
6438          TTI.getGatherScatterOpCost(
6439              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6440              TargetTransformInfo::TCK_RecipThroughput, I);
6441 }
6442 
6443 InstructionCost
6444 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6445                                                    ElementCount VF) {
6446   // TODO: Once we have support for interleaving with scalable vectors
6447   // we can calculate the cost properly here.
6448   if (VF.isScalable())
6449     return InstructionCost::getInvalid();
6450 
6451   Type *ValTy = getLoadStoreType(I);
6452   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6453   unsigned AS = getLoadStoreAddressSpace(I);
6454 
6455   auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");
6457 
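  // Illustrative, hypothetical loop: in
  //   for (i) sum += A[2*i] + A[2*i+1];
  // the two loads form a factor-2 interleave group, so WideVecTy below is
  // twice as wide as VectorTy and Indices lists the members that are present.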
6458   unsigned InterleaveFactor = Group->getFactor();
6459   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6460 
6461   // Holds the indices of existing members in the interleaved group.
6462   SmallVector<unsigned, 4> Indices;
6463   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6464     if (Group->getMember(IF))
6465       Indices.push_back(IF);
6466 
6467   // Calculate the cost of the whole interleaved group.
6468   bool UseMaskForGaps =
6469       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6470       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6471   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6472       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6473       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6474 
6475   if (Group->isReverse()) {
6476     // TODO: Add support for reversed masked interleaved access.
6477     assert(!Legal->isMaskRequired(I) &&
6478            "Reverse masked interleaved access not supported.");
6479     Cost +=
6480         Group->getNumMembers() *
6481         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6482   }
6483   return Cost;
6484 }
6485 
6486 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6487     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6488   using namespace llvm::PatternMatch;
6489   // Early exit for no inloop reductions
6490   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6491     return None;
6492   auto *VectorTy = cast<VectorType>(Ty);
6493 
  // We are looking for one of the following patterns, finding the minimal
  // acceptable cost among them:
6495   //  reduce(mul(ext(A), ext(B))) or
6496   //  reduce(mul(A, B)) or
6497   //  reduce(ext(A)) or
6498   //  reduce(A).
6499   // The basic idea is that we walk down the tree to do that, finding the root
6500   // reduction instruction in InLoopReductionImmediateChains. From there we find
6501   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the
  // pattern. If it is not, we return None for the other instructions so that
  // the original cost model is used for them, and the base reduction cost
  // for the reduction instruction itself.
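  // Illustrative, hypothetical IR for the first pattern:
  //   %ea  = sext i8 %a to i32
  //   %eb  = sext i8 %b to i32
  //   %mul = mul nsw i32 %ea, %eb
  //   %add = add i32 %mul, %phi   ; the in-loop reduction add
  // which may be costed as a single extended multiply-add reduction.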
6506   Instruction *RetI = I;
6507   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6508     if (!RetI->hasOneUser())
6509       return None;
6510     RetI = RetI->user_back();
6511   }
6512   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6513       RetI->user_back()->getOpcode() == Instruction::Add) {
6514     if (!RetI->hasOneUser())
6515       return None;
6516     RetI = RetI->user_back();
6517   }
6518 
  // Test if the found instruction is a reduction, and if not return None so
  // that the caller uses the original cost modelling.
6521   if (!InLoopReductionImmediateChains.count(RetI))
6522     return None;
6523 
6524   // Find the reduction this chain is a part of and calculate the basic cost of
6525   // the reduction on its own.
6526   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6527   Instruction *ReductionPhi = LastChain;
6528   while (!isa<PHINode>(ReductionPhi))
6529     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6530 
6531   const RecurrenceDescriptor &RdxDesc =
6532       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6533 
6534   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6535       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6536 
6537   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6538   // normal fmul instruction to the cost of the fadd reduction.
6539   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6540     BaseCost +=
6541         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6542 
6543   // If we're using ordered reductions then we can just return the base cost
6544   // here, since getArithmeticReductionCost calculates the full ordered
6545   // reduction cost when FP reassociation is not allowed.
6546   if (useOrderedReductions(RdxDesc))
6547     return BaseCost;
6548 
6549   // Get the operand that was not the reduction chain and match it to one of the
6550   // patterns, returning the better cost if it is found.
6551   Instruction *RedOp = RetI->getOperand(1) == LastChain
6552                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6553                            : dyn_cast<Instruction>(RetI->getOperand(1));
6554 
6555   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6556 
6557   Instruction *Op0, *Op1;
6558   if (RedOp &&
6559       match(RedOp,
6560             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6561       match(Op0, m_ZExtOrSExt(m_Value())) &&
6562       Op0->getOpcode() == Op1->getOpcode() &&
6563       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6564       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6565       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6566 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6568     // Note that the extend opcodes need to all match, or if A==B they will have
6569     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6570     // which is equally fine.
6571     bool IsUnsigned = isa<ZExtInst>(Op0);
6572     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6573     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6574 
6575     InstructionCost ExtCost =
6576         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6577                              TTI::CastContextHint::None, CostKind, Op0);
6578     InstructionCost MulCost =
6579         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6580     InstructionCost Ext2Cost =
6581         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6582                              TTI::CastContextHint::None, CostKind, RedOp);
6583 
6584     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6585         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6586         CostKind);
6587 
6588     if (RedCost.isValid() &&
6589         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6590       return I == RetI ? RedCost : 0;
6591   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6592              !TheLoop->isLoopInvariant(RedOp)) {
6593     // Matched reduce(ext(A))
6594     bool IsUnsigned = isa<ZExtInst>(RedOp);
6595     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6596     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6597         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6598         CostKind);
6599 
6600     InstructionCost ExtCost =
6601         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6602                              TTI::CastContextHint::None, CostKind, RedOp);
6603     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6604       return I == RetI ? RedCost : 0;
6605   } else if (RedOp &&
6606              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6607     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6608         Op0->getOpcode() == Op1->getOpcode() &&
6609         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6610       bool IsUnsigned = isa<ZExtInst>(Op0);
6611       Type *Op0Ty = Op0->getOperand(0)->getType();
6612       Type *Op1Ty = Op1->getOperand(0)->getType();
6613       Type *LargestOpTy =
6614           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6615                                                                     : Op0Ty;
6616       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6617 
6618       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
6619       // different sizes. We take the largest type as the ext to reduce, and add
6620       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6621       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6622           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6623           TTI::CastContextHint::None, CostKind, Op0);
6624       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6625           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6626           TTI::CastContextHint::None, CostKind, Op1);
6627       InstructionCost MulCost =
6628           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6629 
6630       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6631           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6632           CostKind);
6633       InstructionCost ExtraExtCost = 0;
6634       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6635         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6636         ExtraExtCost = TTI.getCastInstrCost(
6637             ExtraExtOp->getOpcode(), ExtType,
6638             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6639             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6640       }
6641 
6642       if (RedCost.isValid() &&
6643           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6644         return I == RetI ? RedCost : 0;
6645     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6646       // Matched reduce(mul())
6647       InstructionCost MulCost =
6648           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6649 
6650       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6651           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6652           CostKind);
6653 
6654       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6655         return I == RetI ? RedCost : 0;
6656     }
6657   }
6658 
6659   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
6660 }
6661 
6662 InstructionCost
6663 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6664                                                      ElementCount VF) {
6665   // Calculate scalar cost only. Vectorization cost should be ready at this
6666   // moment.
6667   if (VF.isScalar()) {
6668     Type *ValTy = getLoadStoreType(I);
6669     const Align Alignment = getLoadStoreAlignment(I);
6670     unsigned AS = getLoadStoreAddressSpace(I);
6671 
6672     return TTI.getAddressComputationCost(ValTy) +
6673            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6674                                TTI::TCK_RecipThroughput, I);
6675   }
6676   return getWideningCost(I, VF);
6677 }
6678 
6679 LoopVectorizationCostModel::VectorizationCostTy
6680 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6681                                                ElementCount VF) {
6682   // If we know that this instruction will remain uniform, check the cost of
6683   // the scalar version.
6684   if (isUniformAfterVectorization(I, VF))
6685     VF = ElementCount::getFixed(1);
6686 
6687   if (VF.isVector() && isProfitableToScalarize(I, VF))
6688     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6689 
6690   // Forced scalars do not have any scalarization overhead.
6691   auto ForcedScalar = ForcedScalars.find(VF);
6692   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6693     auto InstSet = ForcedScalar->second;
6694     if (InstSet.count(I))
6695       return VectorizationCostTy(
6696           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6697            VF.getKnownMinValue()),
6698           false);
6699   }
6700 
6701   Type *VectorTy;
6702   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6703 
6704   bool TypeNotScalarized = false;
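  // Illustrative, target-dependent example: with VF = 4 and a <4 x i64>
  // VectorTy on a target with 128-bit vector registers,
  // TTI.getNumberOfParts returns 2; since 2 < 4, TypeNotScalarized is set to
  // true and the type is treated as genuinely vectorized.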
6705   if (VF.isVector() && VectorTy->isVectorTy()) {
6706     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6707       if (VF.isScalable())
6708         // <vscale x 1 x iN> is assumed to be profitable over iN because
6709         // scalable registers are a distinct register class from scalar ones.
6710         // If we ever find a target which wants to lower scalable vectors
6711         // back to scalars, we'll need to update this code to explicitly
6712         // ask TTI about the register class uses for each part.
6713         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6714       else
6715         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6716     } else
6717       C = InstructionCost::getInvalid();
6718   }
6719   return VectorizationCostTy(C, TypeNotScalarized);
6720 }
6721 
6722 InstructionCost
6723 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6724                                                      ElementCount VF) const {
6725 
6726   // There is no mechanism yet to create a scalable scalarization loop,
6727   // so this is currently Invalid.
6728   if (VF.isScalable())
6729     return InstructionCost::getInvalid();
6730 
6731   if (VF.isScalar())
6732     return 0;
6733 
6734   InstructionCost Cost = 0;
6735   Type *RetTy = ToVectorTy(I->getType(), VF);
6736   if (!RetTy->isVoidTy() &&
6737       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6738     Cost += TTI.getScalarizationOverhead(
6739         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6740         false);
6741 
6742   // Some targets keep addresses scalar.
6743   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6744     return Cost;
6745 
6746   // Some targets support efficient element stores.
6747   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6748     return Cost;
6749 
6750   // Collect operands to consider.
6751   CallInst *CI = dyn_cast<CallInst>(I);
6752   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6753 
6754   // Skip operands that do not require extraction/scalarization and do not incur
6755   // any overhead.
6756   SmallVector<Type *> Tys;
6757   for (auto *V : filterExtractingOperands(Ops, VF))
6758     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6759   return Cost + TTI.getOperandsScalarizationOverhead(
6760                     filterExtractingOperands(Ops, VF), Tys);
6761 }
6762 
6763 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6764   if (VF.isScalar())
6765     return;
6766   NumPredStores = 0;
6767   for (BasicBlock *BB : TheLoop->blocks()) {
6768     // For each instruction in the old loop.
6769     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6771       if (!Ptr)
6772         continue;
6773 
6774       // TODO: We should generate better code and update the cost model for
6775       // predicated uniform stores. Today they are treated as any other
6776       // predicated store (see added test cases in
6777       // invariant-store-vectorization.ll).
6778       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6779         NumPredStores++;
6780 
6781       if (Legal->isUniformMemOp(I)) {
6782         // TODO: Avoid replicating loads and stores instead of
6783         // relying on instcombine to remove them.
6784         // Load: Scalar load + broadcast
6785         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6786         InstructionCost Cost;
6787         if (isa<StoreInst>(&I) && VF.isScalable() &&
6788             isLegalGatherOrScatter(&I, VF)) {
6789           Cost = getGatherScatterCost(&I, VF);
6790           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
6791         } else {
6792           Cost = getUniformMemOpCost(&I, VF);
6793           setWideningDecision(&I, VF, CM_Scalarize, Cost);
6794         }
6795         continue;
6796       }
6797 
6798       // We assume that widening is the best solution when possible.
6799       if (memoryInstructionCanBeWidened(&I, VF)) {
6800         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6801         int ConsecutiveStride = Legal->isConsecutivePtr(
6802             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6803         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6804                "Expected consecutive stride.");
6805         InstWidening Decision =
6806             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6807         setWideningDecision(&I, VF, Decision, Cost);
6808         continue;
6809       }
6810 
6811       // Choose between Interleaving, Gather/Scatter or Scalarization.
6812       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6813       unsigned NumAccesses = 1;
6814       if (isAccessInterleaved(&I)) {
6815         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
6817 
6818         // Make one decision for the whole group.
6819         if (getWideningDecision(&I, VF) != CM_Unknown)
6820           continue;
6821 
6822         NumAccesses = Group->getNumMembers();
6823         if (interleavedAccessCanBeWidened(&I, VF))
6824           InterleaveCost = getInterleaveGroupCost(&I, VF);
6825       }
6826 
6827       InstructionCost GatherScatterCost =
6828           isLegalGatherOrScatter(&I, VF)
6829               ? getGatherScatterCost(&I, VF) * NumAccesses
6830               : InstructionCost::getInvalid();
6831 
6832       InstructionCost ScalarizationCost =
6833           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6834 
      // Choose the best option for the current VF, record this decision and
      // use it during vectorization.
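      // Hypothetical illustration: with InterleaveCost = 8,
      // GatherScatterCost = 12 and ScalarizationCost = 20, the access is
      // widened as CM_Interleave with cost 8.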
6837       InstructionCost Cost;
6838       InstWidening Decision;
6839       if (InterleaveCost <= GatherScatterCost &&
6840           InterleaveCost < ScalarizationCost) {
6841         Decision = CM_Interleave;
6842         Cost = InterleaveCost;
6843       } else if (GatherScatterCost < ScalarizationCost) {
6844         Decision = CM_GatherScatter;
6845         Cost = GatherScatterCost;
6846       } else {
6847         Decision = CM_Scalarize;
6848         Cost = ScalarizationCost;
6849       }
      // If the instruction belongs to an interleave group, the whole group
6851       // receives the same decision. The whole group receives the cost, but
6852       // the cost will actually be assigned to one instruction.
6853       if (auto Group = getInterleavedAccessGroup(&I))
6854         setWideningDecision(Group, VF, Decision, Cost);
6855       else
6856         setWideningDecision(&I, VF, Decision, Cost);
6857     }
6858   }
6859 
6860   // Make sure that any load of address and any other address computation
6861   // remains scalar unless there is gather/scatter support. This avoids
6862   // inevitable extracts into address registers, and also has the benefit of
6863   // activating LSR more, since that pass can't optimize vectorized
6864   // addresses.
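  // Illustrative, hypothetical IR: if %p = load i32*, i32** %q is used only
  // as the address of other loads/stores, keeping %p (and the instructions
  // computing %q) scalar avoids extracting every lane of a widened pointer
  // load just to form addresses.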
6865   if (TTI.prefersVectorizedAddressing())
6866     return;
6867 
6868   // Start with all scalar pointer uses.
6869   SmallPtrSet<Instruction *, 8> AddrDefs;
6870   for (BasicBlock *BB : TheLoop->blocks())
6871     for (Instruction &I : *BB) {
6872       Instruction *PtrDef =
6873         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6874       if (PtrDef && TheLoop->contains(PtrDef) &&
6875           getWideningDecision(&I, VF) != CM_GatherScatter)
6876         AddrDefs.insert(PtrDef);
6877     }
6878 
6879   // Add all instructions used to generate the addresses.
6880   SmallVector<Instruction *, 4> Worklist;
6881   append_range(Worklist, AddrDefs);
6882   while (!Worklist.empty()) {
6883     Instruction *I = Worklist.pop_back_val();
6884     for (auto &Op : I->operands())
6885       if (auto *InstOp = dyn_cast<Instruction>(Op))
6886         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6887             AddrDefs.insert(InstOp).second)
6888           Worklist.push_back(InstOp);
6889   }
6890 
6891   for (auto *I : AddrDefs) {
6892     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
6895       // if the loaded register is involved in an address computation, it is
6896       // instead changed here when we know this is the case.
6897       InstWidening Decision = getWideningDecision(I, VF);
6898       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6899         // Scalarize a widened load of address.
6900         setWideningDecision(
6901             I, VF, CM_Scalarize,
6902             (VF.getKnownMinValue() *
6903              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6904       else if (auto Group = getInterleavedAccessGroup(I)) {
6905         // Scalarize an interleave group of address loads.
6906         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6907           if (Instruction *Member = Group->getMember(I))
6908             setWideningDecision(
6909                 Member, VF, CM_Scalarize,
6910                 (VF.getKnownMinValue() *
6911                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6912         }
6913       }
6914     } else
6915       // Make sure I gets scalarized and a cost estimate without
6916       // scalarization overhead.
6917       ForcedScalars[VF].insert(I);
6918   }
6919 }
6920 
6921 InstructionCost
6922 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6923                                                Type *&VectorTy) {
6924   Type *RetTy = I->getType();
6925   if (canTruncateToMinimalBitwidth(I, VF))
6926     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6927   auto SE = PSE.getSE();
6928   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6929 
6930   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6931                                                 ElementCount VF) -> bool {
6932     if (VF.isScalar())
6933       return true;
6934 
6935     auto Scalarized = InstsToScalarize.find(VF);
6936     assert(Scalarized != InstsToScalarize.end() &&
6937            "VF not yet analyzed for scalarization profitability");
6938     return !Scalarized->second.count(I) &&
6939            llvm::all_of(I->users(), [&](User *U) {
6940              auto *UI = cast<Instruction>(U);
6941              return !Scalarized->second.count(UI);
6942            });
6943   };
6944   (void) hasSingleCopyAfterVectorization;
6945 
6946   if (isScalarAfterVectorization(I, VF)) {
6947     // With the exception of GEPs and PHIs, after scalarization there should
6948     // only be one copy of the instruction generated in the loop. This is
6949     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
6951     // it means we don't have to multiply the instruction cost by VF.
6952     assert(I->getOpcode() == Instruction::GetElementPtr ||
6953            I->getOpcode() == Instruction::PHI ||
6954            (I->getOpcode() == Instruction::BitCast &&
6955             I->getType()->isPointerTy()) ||
6956            hasSingleCopyAfterVectorization(I, VF));
6957     VectorTy = RetTy;
6958   } else
6959     VectorTy = ToVectorTy(RetTy, VF);
6960 
6961   // TODO: We need to estimate the cost of intrinsic calls.
6962   switch (I->getOpcode()) {
6963   case Instruction::GetElementPtr:
6964     // We mark this instruction as zero-cost because the cost of GEPs in
6965     // vectorized code depends on whether the corresponding memory instruction
6966     // is scalarized or not. Therefore, we handle GEPs with the memory
6967     // instruction cost.
6968     return 0;
6969   case Instruction::Br: {
6970     // In cases of scalarized and predicated instructions, there will be VF
6971     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6973     bool ScalarPredicatedBB = false;
6974     BranchInst *BI = cast<BranchInst>(I);
6975     if (VF.isVector() && BI->isConditional() &&
6976         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6977          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6978       ScalarPredicatedBB = true;
6979 
6980     if (ScalarPredicatedBB) {
6981       // Not possible to scalarize scalable vector with predicated instructions.
6982       if (VF.isScalable())
6983         return InstructionCost::getInvalid();
6984       // Return cost for branches around scalarized and predicated blocks.
6985       auto *Vec_i1Ty =
6986           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6987       return (
6988           TTI.getScalarizationOverhead(
6989               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
6990           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6991     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6992       // The back-edge branch will remain, as will all scalar branches.
6993       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6994     else
6995       // This branch will be eliminated by if-conversion.
6996       return 0;
6997     // Note: We currently assume zero cost for an unconditional branch inside
6998     // a predicated block since it will become a fall-through, although we
6999     // may decide in the future to call TTI for all branches.
7000   }
7001   case Instruction::PHI: {
7002     auto *Phi = cast<PHINode>(I);
7003 
7004     // First-order recurrences are replaced by vector shuffles inside the loop.
7005     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7006     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7007       return TTI.getShuffleCost(
7008           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7009           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7010 
7011     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7012     // converted into select instructions. We require N - 1 selects per phi
7013     // node, where N is the number of incoming values.
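    // For example, a phi merging three incoming values after if-conversion
    // is costed as two selects feeding one another.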
7014     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7015       return (Phi->getNumIncomingValues() - 1) *
7016              TTI.getCmpSelInstrCost(
7017                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7018                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7019                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7020 
7021     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7022   }
7023   case Instruction::UDiv:
7024   case Instruction::SDiv:
7025   case Instruction::URem:
7026   case Instruction::SRem:
7027     // If we have a predicated instruction, it may not be executed for each
7028     // vector lane. Get the scalarization cost and scale this amount by the
7029     // probability of executing the predicated block. If the instruction is not
7030     // predicated, we fall through to the next case.
7031     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7032       InstructionCost Cost = 0;
7033 
7034       // These instructions have a non-void type, so account for the phi nodes
7035       // that we will create. This cost is likely to be zero. The phi node
7036       // cost, if any, should be scaled by the block probability because it
7037       // models a copy at the end of each predicated block.
7038       Cost += VF.getKnownMinValue() *
7039               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7040 
7041       // The cost of the non-predicated instruction.
7042       Cost += VF.getKnownMinValue() *
7043               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7044 
7045       // The cost of insertelement and extractelement instructions needed for
7046       // scalarization.
7047       Cost += getScalarizationOverhead(I, VF);
7048 
7049       // Scale the cost by the probability of executing the predicated blocks.
7050       // This assumes the predicated block for each vector lane is equally
7051       // likely.
7052       return Cost / getReciprocalPredBlockProb();
7053     }
7054     LLVM_FALLTHROUGH;
7055   case Instruction::Add:
7056   case Instruction::FAdd:
7057   case Instruction::Sub:
7058   case Instruction::FSub:
7059   case Instruction::Mul:
7060   case Instruction::FMul:
7061   case Instruction::FDiv:
7062   case Instruction::FRem:
7063   case Instruction::Shl:
7064   case Instruction::LShr:
7065   case Instruction::AShr:
7066   case Instruction::And:
7067   case Instruction::Or:
7068   case Instruction::Xor: {
7069     // Since we will replace the stride by 1 the multiplication should go away.
7070     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7071       return 0;
7072 
7073     // Detect reduction patterns
7074     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7075       return *RedCost;
7076 
7077     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7079     Value *Op2 = I->getOperand(1);
7080     TargetTransformInfo::OperandValueProperties Op2VP;
7081     TargetTransformInfo::OperandValueKind Op2VK =
7082         TTI.getOperandInfo(Op2, Op2VP);
7083     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7084       Op2VK = TargetTransformInfo::OK_UniformValue;
7085 
7086     SmallVector<const Value *, 4> Operands(I->operand_values());
7087     return TTI.getArithmeticInstrCost(
7088         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7089         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7090   }
7091   case Instruction::FNeg: {
7092     return TTI.getArithmeticInstrCost(
7093         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7094         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7095         TargetTransformInfo::OP_None, I->getOperand(0), I);
7096   }
7097   case Instruction::Select: {
7098     SelectInst *SI = cast<SelectInst>(I);
7099     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7100     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7101 
7102     const Value *Op0, *Op1;
7103     using namespace llvm::PatternMatch;
7104     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7105                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7106       // select x, y, false --> x & y
7107       // select x, true, y --> x | y
7108       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7109       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7110       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7111       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);
7114 
7115       SmallVector<const Value *, 2> Operands{Op0, Op1};
7116       return TTI.getArithmeticInstrCost(
7117           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7118           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7119     }
7120 
7121     Type *CondTy = SI->getCondition()->getType();
7122     if (!ScalarCond)
7123       CondTy = VectorType::get(CondTy, VF);
7124 
7125     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7126     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7127       Pred = Cmp->getPredicate();
7128     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7129                                   CostKind, I);
7130   }
7131   case Instruction::ICmp:
7132   case Instruction::FCmp: {
7133     Type *ValTy = I->getOperand(0)->getType();
7134     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7135     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7136       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7137     VectorTy = ToVectorTy(ValTy, VF);
7138     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7139                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7140                                   I);
7141   }
7142   case Instruction::Store:
7143   case Instruction::Load: {
7144     ElementCount Width = VF;
7145     if (Width.isVector()) {
7146       InstWidening Decision = getWideningDecision(I, Width);
7147       assert(Decision != CM_Unknown &&
7148              "CM decision should be taken at this point");
7149       if (Decision == CM_Scalarize) {
7150         if (VF.isScalable() && isa<StoreInst>(I))
7151           // We can't scalarize a scalable vector store (even a uniform one
7152           // currently), return an invalid cost so as to prevent vectorization.
7153           return InstructionCost::getInvalid();
7154         Width = ElementCount::getFixed(1);
7155       }
7156     }
7157     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7158     return getMemoryInstructionCost(I, VF);
7159   }
7160   case Instruction::BitCast:
7161     if (I->getType()->isPointerTy())
7162       return 0;
7163     LLVM_FALLTHROUGH;
7164   case Instruction::ZExt:
7165   case Instruction::SExt:
7166   case Instruction::FPToUI:
7167   case Instruction::FPToSI:
7168   case Instruction::FPExt:
7169   case Instruction::PtrToInt:
7170   case Instruction::IntToPtr:
7171   case Instruction::SIToFP:
7172   case Instruction::UIToFP:
7173   case Instruction::Trunc:
7174   case Instruction::FPTrunc: {
7175     // Computes the CastContextHint from a Load/Store instruction.
7176     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7177       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7178              "Expected a load or a store!");
7179 
7180       if (VF.isScalar() || !TheLoop->contains(I))
7181         return TTI::CastContextHint::Normal;
7182 
7183       switch (getWideningDecision(I, VF)) {
7184       case LoopVectorizationCostModel::CM_GatherScatter:
7185         return TTI::CastContextHint::GatherScatter;
7186       case LoopVectorizationCostModel::CM_Interleave:
7187         return TTI::CastContextHint::Interleave;
7188       case LoopVectorizationCostModel::CM_Scalarize:
7189       case LoopVectorizationCostModel::CM_Widen:
7190         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7191                                         : TTI::CastContextHint::Normal;
7192       case LoopVectorizationCostModel::CM_Widen_Reverse:
7193         return TTI::CastContextHint::Reversed;
7194       case LoopVectorizationCostModel::CM_Unknown:
7195         llvm_unreachable("Instr did not go through cost modelling?");
7196       }
7197 
7198       llvm_unreachable("Unhandled case!");
7199     };
7200 
7201     unsigned Opcode = I->getOpcode();
7202     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7203     // For Trunc, the context is the only user, which must be a StoreInst.
7204     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7205       if (I->hasOneUse())
7206         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7207           CCH = ComputeCCH(Store);
7208     }
    // For ZExt/SExt/FPExt, the context is the operand, which must be a
    // LoadInst.
7210     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7211              Opcode == Instruction::FPExt) {
7212       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7213         CCH = ComputeCCH(Load);
7214     }
7215 
7216     // We optimize the truncation of induction variables having constant
7217     // integer steps. The cost of these truncations is the same as the scalar
7218     // operation.
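    // For example (illustrative IR), 'trunc i64 %iv to i32' where %iv is an
    // induction variable with a constant step is costed as one scalar trunc.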
7219     if (isOptimizableIVTruncate(I, VF)) {
7220       auto *Trunc = cast<TruncInst>(I);
7221       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7222                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7223     }
7224 
7225     // Detect reduction patterns
7226     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7227       return *RedCost;
7228 
7229     Type *SrcScalarTy = I->getOperand(0)->getType();
7230     Type *SrcVecTy =
7231         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7232     if (canTruncateToMinimalBitwidth(I, VF)) {
7233       // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
7235       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7236       //
7237       // Calculate the modified src and dest types.
7238       Type *MinVecTy = VectorTy;
7239       if (Opcode == Instruction::Trunc) {
7240         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7241         VectorTy =
7242             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7243       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7244         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7245         VectorTy =
7246             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7247       }
7248     }
7249 
7250     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7251   }
7252   case Instruction::Call: {
7253     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7254       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7255         return *RedCost;
7256     bool NeedToScalarize;
7257     CallInst *CI = cast<CallInst>(I);
7258     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7259     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7260       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7261       return std::min(CallCost, IntrinsicCost);
7262     }
7263     return CallCost;
7264   }
7265   case Instruction::ExtractValue:
7266     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7267   case Instruction::Alloca:
7268     // We cannot easily widen alloca to a scalable alloca, as
7269     // the result would need to be a vector of pointers.
7270     if (VF.isScalable())
7271       return InstructionCost::getInvalid();
7272     LLVM_FALLTHROUGH;
7273   default:
7274     // This opcode is unknown. Assume that it is the same as 'mul'.
7275     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7276   } // end of switch.
7277 }
7278 
7279 char LoopVectorize::ID = 0;
7280 
7281 static const char lv_name[] = "Loop Vectorization";
7282 
7283 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7284 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7285 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7286 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7287 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7288 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7289 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7290 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7291 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7292 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7293 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7294 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7295 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7296 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7297 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7298 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7299 
7300 namespace llvm {
7301 
7302 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7303 
7304 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7305                               bool VectorizeOnlyWhenForced) {
7306   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7307 }
7308 
7309 } // end namespace llvm
7310 
7311 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7312   // Check if the pointer operand of a load or store instruction is
7313   // consecutive.
7314   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7315     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7316   return false;
7317 }
7318 
7319 void LoopVectorizationCostModel::collectValuesToIgnore() {
7320   // Ignore ephemeral values.
7321   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7322 
  // Find all stores to invariant variables. Since they are going to be sunk
  // outside the loop, we do not need to calculate their cost.
7325   for (BasicBlock *BB : TheLoop->blocks())
7326     for (Instruction &I : *BB) {
7327       StoreInst *SI;
7328       if ((SI = dyn_cast<StoreInst>(&I)) &&
7329           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7330         ValuesToIgnore.insert(&I);
7331     }
7332 
7333   // Ignore type-promoting instructions we identified during reduction
7334   // detection.
7335   for (auto &Reduction : Legal->getReductionVars()) {
7336     const RecurrenceDescriptor &RedDes = Reduction.second;
7337     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7338     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7339   }
7340   // Ignore type-casting instructions we identified during induction
7341   // detection.
7342   for (auto &Induction : Legal->getInductionVars()) {
7343     const InductionDescriptor &IndDes = Induction.second;
7344     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7345     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7346   }
7347 }
7348 
7349 void LoopVectorizationCostModel::collectInLoopReductions() {
7350   for (auto &Reduction : Legal->getReductionVars()) {
7351     PHINode *Phi = Reduction.first;
7352     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7353 
7354     // We don't collect reductions that are type promoted (yet).
7355     if (RdxDesc.getRecurrenceType() != Phi->getType())
7356       continue;
7357 
7358     // If the target would prefer this reduction to happen "in-loop", then we
7359     // want to record it as such.
7360     unsigned Opcode = RdxDesc.getOpcode();
7361     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7362         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7363                                    TargetTransformInfo::ReductionFlags()))
7364       continue;
7365 
7366     // Check that we can correctly put the reductions into the loop, by
7367     // finding the chain of operations that leads from the phi to the loop
7368     // exit value.
7369     SmallVector<Instruction *, 4> ReductionOperations =
7370         RdxDesc.getReductionOpChain(Phi, TheLoop);
7371     bool InLoop = !ReductionOperations.empty();
7372     if (InLoop) {
7373       InLoopReductionChains[Phi] = ReductionOperations;
7374       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7375       Instruction *LastChain = Phi;
7376       for (auto *I : ReductionOperations) {
7377         InLoopReductionImmediateChains[I] = LastChain;
7378         LastChain = I;
7379       }
7380     }
7381     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7382                       << " reduction for phi: " << *Phi << "\n");
7383   }
7384 }
7385 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it yet because VPlan does not have a
// cost model that can choose which plan to execute if more than one is
// generated.
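// For example (illustrative), with 256-bit wide fixed vector registers and a
// widest scalar type of 32 bits, the returned VF is 256 / 32 = 8.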
7391 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7392                                  LoopVectorizationCostModel &CM) {
7393   unsigned WidestType;
7394   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7395   return WidestVectorRegBits / WidestType;
7396 }
7397 
7398 VectorizationFactor
7399 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7400   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7401   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
7404   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7405   // the vectorization pipeline.
7406   if (!OrigLoop->isInnermost()) {
7407     // If the user doesn't provide a vectorization factor, determine a
7408     // reasonable one.
7409     if (UserVF.isZero()) {
7410       VF = ElementCount::getFixed(determineVPlanVF(
7411           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7412               .getFixedSize(),
7413           CM));
7414       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7415 
7416       // Make sure we have a VF > 1 for stress testing.
7417       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7418         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7419                           << "overriding computed VF.\n");
7420         VF = ElementCount::getFixed(4);
7421       }
7422     }
7423     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7424     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7425            "VF needs to be a power of two");
7426     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7427                       << "VF " << VF << " to build VPlans.\n");
7428     buildVPlans(VF, VF);
7429 
7430     // For VPlan build stress testing, we bail out after VPlan construction.
7431     if (VPlanBuildStressTest)
7432       return VectorizationFactor::Disabled();
7433 
7434     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7435   }
7436 
7437   LLVM_DEBUG(
7438       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7439                 "VPlan-native path.\n");
7440   return VectorizationFactor::Disabled();
7441 }
7442 
7443 Optional<VectorizationFactor>
7444 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7445   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7446   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7448     return None;
7449 
7450   // Invalidate interleave groups if all blocks of loop will be predicated.
7451   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7452       !useMaskedInterleavedAccesses(*TTI)) {
7453     LLVM_DEBUG(
7454         dbgs()
7455         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7456            "which requires masked-interleaved support.\n");
7457     if (CM.InterleaveInfo.invalidateGroups())
7458       // Invalidating interleave groups also requires invalidating all decisions
7459       // based on them, which includes widening decisions and uniform and scalar
7460       // values.
7461       CM.invalidateCostModelingDecisions();
7462   }
7463 
7464   ElementCount MaxUserVF =
7465       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7466   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7467   if (!UserVF.isZero() && UserVFIsLegal) {
7468     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7469            "VF needs to be a power of two");
7470     // Collect the instructions (and their associated costs) that will be more
7471     // profitable to scalarize.
7472     if (CM.selectUserVectorizationFactor(UserVF)) {
7473       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7474       CM.collectInLoopReductions();
7475       buildVPlansWithVPRecipes(UserVF, UserVF);
7476       LLVM_DEBUG(printPlans(dbgs()));
7477       return {{UserVF, 0, 0}};
7478     } else
7479       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7480                               "InvalidCost", ORE, OrigLoop);
7481   }
7482 
7483   // Populate the set of Vectorization Factor Candidates.
7484   ElementCountSet VFCandidates;
7485   for (auto VF = ElementCount::getFixed(1);
7486        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7487     VFCandidates.insert(VF);
7488   for (auto VF = ElementCount::getScalable(1);
7489        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7490     VFCandidates.insert(VF);
7491 
7492   for (const auto &VF : VFCandidates) {
7493     // Collect Uniform and Scalar instructions after vectorization with VF.
7494     CM.collectUniformsAndScalars(VF);
7495 
7496     // Collect the instructions (and their associated costs) that will be more
7497     // profitable to scalarize.
7498     if (VF.isVector())
7499       CM.collectInstsToScalarize(VF);
7500   }
7501 
7502   CM.collectInLoopReductions();
7503   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7504   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7505 
7506   LLVM_DEBUG(printPlans(dbgs()));
7507   if (!MaxFactors.hasVector())
7508     return VectorizationFactor::Disabled();
7509 
7510   // Select the optimal vectorization factor.
7511   VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
  assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be non-zero.");
7513   return VF;
7514 }
7515 
7516 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7517   assert(count_if(VPlans,
7518                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7519              1 &&
         "Best VF does not have a single VPlan.");
7521 
7522   for (const VPlanPtr &Plan : VPlans) {
7523     if (Plan->hasVF(VF))
7524       return *Plan.get();
7525   }
7526   llvm_unreachable("No plan found!");
7527 }
7528 
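// Attach "llvm.loop.unroll.runtime.disable" to \p L's loop metadata, unless
// unrolling is already disabled via "llvm.loop.unroll.disable". The resulting
// metadata looks roughly like (illustrative):
//   !0 = distinct !{!0, <existing operands>, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}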
7529 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7530   SmallVector<Metadata *, 4> MDs;
7531   // Reserve first location for self reference to the LoopID metadata node.
7532   MDs.push_back(nullptr);
7533   bool IsUnrollMetadata = false;
7534   MDNode *LoopID = L->getLoopID();
7535   if (LoopID) {
7536     // First find existing loop unrolling disable metadata.
7537     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7538       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7539       if (MD) {
7540         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
        // Use |= so a later operand cannot reset the flag once it is set.
        IsUnrollMetadata |=
            S && S->getString().startswith("llvm.loop.unroll.disable");
7543       }
7544       MDs.push_back(LoopID->getOperand(i));
7545     }
7546   }
7547 
7548   if (!IsUnrollMetadata) {
7549     // Add runtime unroll disable metadata.
7550     LLVMContext &Context = L->getHeader()->getContext();
7551     SmallVector<Metadata *, 1> DisableOperands;
7552     DisableOperands.push_back(
7553         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7554     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7555     MDs.push_back(DisableNode);
7556     MDNode *NewLoopID = MDNode::get(Context, MDs);
7557     // Set operand 0 to refer to the loop id itself.
7558     NewLoopID->replaceOperandWith(0, NewLoopID);
7559     L->setLoopID(NewLoopID);
7560   }
7561 }
7562 
7563 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7564                                            VPlan &BestVPlan,
7565                                            InnerLoopVectorizer &ILV,
7566                                            DominatorTree *DT,
7567                                            bool IsEpilogueVectorization) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                    << ", UF=" << BestUF << '\n');
7570 
7571   // Perform the actual loop transformation.
7572 
7573   // 1. Set up the skeleton for vectorization, including vector pre-header and
7574   // middle block. The vector loop is created during VPlan execution.
7575   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7576   Value *CanonicalIVStartValue;
7577   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7578       ILV.createVectorizedLoopSkeleton();
7579 
7580   // Only use noalias metadata when using memory checks guaranteeing no overlap
7581   // across all iterations.
7582   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7583   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7584       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7585 
7586     //  We currently don't use LoopVersioning for the actual loop cloning but we
7587     //  still use it to add the noalias metadata.
7588     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7589     //        metadata.
7590     State.LVer = std::make_unique<LoopVersioning>(
7591         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7592         PSE.getSE());
7593     State.LVer->prepareNoAliasMetadata();
7594   }
7595 
7596   ILV.collectPoisonGeneratingRecipes(State);
7597 
7598   ILV.printDebugTracesAtStart();
7599 
7600   //===------------------------------------------------===//
7601   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
7605   //
7606   //===------------------------------------------------===//
7607 
7608   // 2. Copy and widen instructions from the old loop into the new loop.
7609   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7610                              ILV.getOrCreateVectorTripCount(nullptr),
7611                              CanonicalIVStartValue, State,
7612                              IsEpilogueVectorization);
7613 
7614   BestVPlan.execute(&State);
7615 
  // Look up the original loop metadata, from which the follow-up metadata for
  // the vectorized loop is derived below.
7618   MDNode *OrigLoopID = OrigLoop->getLoopID();
7619 
7620   Optional<MDNode *> VectorizedLoopID =
7621       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7622                                       LLVMLoopVectorizeFollowupVectorized});
7623 
7624   VPBasicBlock *HeaderVPBB =
7625       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7626   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7627   if (VectorizedLoopID)
7628     L->setLoopID(VectorizedLoopID.value());
7629   else {
7630     // Keep all loop hints from the original loop on the vector loop (we'll
7631     // replace the vectorizer-specific hints below).
7632     if (MDNode *LID = OrigLoop->getLoopID())
7633       L->setLoopID(LID);
7634 
7635     LoopVectorizeHints Hints(L, true, *ORE);
7636     Hints.setAlreadyVectorized();
7637   }
7638   // Disable runtime unrolling when vectorizing the epilogue loop.
7639   if (CanonicalIVStartValue)
7640     AddRuntimeUnrollDisableMetaData(L);
7641 
7642   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7643   //    predication, updating analyses.
7644   ILV.fixVectorizedLoop(State, BestVPlan);
7645 
7646   ILV.printDebugTracesAtEnd();
7647 }
7648 
7649 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7650 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7651   for (const auto &Plan : VPlans)
7652     if (PrintVPlansInDotFormat)
7653       Plan->printDOT(O);
7654     else
7655       Plan->print(O);
7656 }
7657 #endif
7658 
7659 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7660 
7661 //===--------------------------------------------------------------------===//
7662 // EpilogueVectorizerMainLoop
7663 //===--------------------------------------------------------------------===//
7664 
7665 /// This function is partially responsible for generating the control flow
7666 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7667 std::pair<BasicBlock *, Value *>
7668 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7669   MDNode *OrigLoopID = OrigLoop->getLoopID();
7670 
7671   // Workaround!  Compute the trip count of the original loop and cache it
7672   // before we start modifying the CFG.  This code has a systemic problem
7673   // wherein it tries to run analysis over partially constructed IR; this is
7674   // wrong, and not simply for SCEV.  The trip count of the original loop
7675   // simply happens to be prone to hitting this in practice.  In theory, we
7676   // can hit the same issue for any SCEV, or ValueTracking query done during
7677   // mutation.  See PR49900.
7678   getOrCreateTripCount(OrigLoop->getLoopPreheader());
7679   createVectorLoopSkeleton("");
7680 
7681   // Generate the code to check the minimum iteration count of the vector
7682   // epilogue (see below).
7683   EPI.EpilogueIterationCountCheck =
7684       emitIterationCountCheck(LoopScalarPreHeader, true);
7685   EPI.EpilogueIterationCountCheck->setName("iter.check");
7686 
7687   // Generate the code to check any assumptions that we've made for SCEV
7688   // expressions.
7689   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7690 
7691   // Generate the code that checks at runtime if arrays overlap. We put the
7692   // checks into a separate block to make the more common case of few elements
7693   // faster.
7694   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7695 
7696   // Generate the iteration count check for the main loop, *after* the check
7697   // for the epilogue loop, so that the path-length is shorter for the case
7698   // that goes directly through the vector epilogue. The longer-path length for
7699   // the main loop is compensated for, by the gain from vectorizing the larger
7700   // trip count. Note: the branch will get updated later on when we vectorize
7701   // the epilogue.
7702   EPI.MainLoopIterationCountCheck =
7703       emitIterationCountCheck(LoopScalarPreHeader, false);
7704 
7705   // Generate the induction variable.
7706   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7707 
  // Skip induction resume value creation here because the resume values will
  // be created in the second pass. If we created them here, they wouldn't be
  // used anyway, because the VPlan in the second pass still contains the
  // inductions from the original loop.
7712 
7713   return {completeLoopSkeleton(OrigLoopID), nullptr};
7714 }
7715 
7716 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7717   LLVM_DEBUG({
7718     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7719            << "Main Loop VF:" << EPI.MainLoopVF
7720            << ", Main Loop UF:" << EPI.MainLoopUF
7721            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7722            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7723   });
7724 }
7725 
7726 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7727   DEBUG_WITH_TYPE(VerboseDebug, {
7728     dbgs() << "intermediate fn:\n"
7729            << *OrigLoop->getHeader()->getParent() << "\n";
7730   });
7731 }
7732 
7733 BasicBlock *
7734 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7735                                                     bool ForEpilogue) {
7736   assert(Bypass && "Expected valid bypass basic block.");
7737   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7738   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7739   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7740   // Reuse existing vector loop preheader for TC checks.
7741   // Note that new preheader block is generated for vector loop.
7742   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7743   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7744 
7745   // Generate code to check if the loop's trip count is less than VF * UF of the
7746   // main vector loop.
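  // For a fixed-width main loop this emits, e.g. (illustrative, VF=4, UF=2):
  //   %min.iters.check = icmp ult i64 %count, 8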
  auto P = Cost->requiresScalarEpilogue(VFactor) ? ICmpInst::ICMP_ULE
                                                 : ICmpInst::ICMP_ULT;
7749 
7750   Value *CheckMinIters = Builder.CreateICmp(
7751       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7752       "min.iters.check");
7753 
7754   if (!ForEpilogue)
7755     TCCheckBlock->setName("vector.main.loop.iter.check");
7756 
7757   // Create new preheader for vector loop.
7758   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7759                                    DT, LI, nullptr, "vector.ph");
7760 
7761   if (ForEpilogue) {
7762     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7763                                  DT->getNode(Bypass)->getIDom()) &&
7764            "TC check is expected to dominate Bypass");
7765 
7766     // Update dominator for Bypass & LoopExit.
7767     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7768     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7769       // For loops with multiple exits, there's no edge from the middle block
7770       // to exit blocks (as the epilogue must run) and thus no need to update
7771       // the immediate dominator of the exit blocks.
7772       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7773 
7774     LoopBypassBlocks.push_back(TCCheckBlock);
7775 
7776     // Save the trip count so we don't have to regenerate it in the
7777     // vec.epilog.iter.check. This is safe to do because the trip count
7778     // generated here dominates the vector epilog iter check.
7779     EPI.TripCount = Count;
7780   }
7781 
7782   ReplaceInstWithInst(
7783       TCCheckBlock->getTerminator(),
7784       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7785 
7786   return TCCheckBlock;
7787 }
7788 
7789 //===--------------------------------------------------------------------===//
7790 // EpilogueVectorizerEpilogueLoop
7791 //===--------------------------------------------------------------------===//
7792 
7793 /// This function is partially responsible for generating the control flow
7794 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7795 std::pair<BasicBlock *, Value *>
7796 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7797   MDNode *OrigLoopID = OrigLoop->getLoopID();
7798   createVectorLoopSkeleton("vec.epilog.");
7799 
  // Now, compare the remaining count and, if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
7802   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7803   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7804   LoopVectorPreHeader =
7805       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7806                  LI, nullptr, "vec.epilog.ph");
7807   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7808                                           VecEpilogueIterationCountCheck);
7809 
7810   // Adjust the control flow taking the state info from the main loop
7811   // vectorization into account.
7812   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7813          "expected this to be saved from the previous pass.");
7814   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7815       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7816 
7817   DT->changeImmediateDominator(LoopVectorPreHeader,
7818                                EPI.MainLoopIterationCountCheck);
7819 
7820   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7821       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7822 
7823   if (EPI.SCEVSafetyCheck)
7824     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7825         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7826   if (EPI.MemSafetyCheck)
7827     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7828         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7829 
7830   DT->changeImmediateDominator(
7831       VecEpilogueIterationCountCheck,
7832       VecEpilogueIterationCountCheck->getSinglePredecessor());
7833 
7834   DT->changeImmediateDominator(LoopScalarPreHeader,
7835                                EPI.EpilogueIterationCountCheck);
7836   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7837     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
7839     // dominator of the exit blocks.
7840     DT->changeImmediateDominator(LoopExitBlock,
7841                                  EPI.EpilogueIterationCountCheck);
7842 
7843   // Keep track of bypass blocks, as they feed start values to the induction
7844   // phis in the scalar loop preheader.
7845   if (EPI.SCEVSafetyCheck)
7846     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7847   if (EPI.MemSafetyCheck)
7848     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7849   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7850 
7851   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
7852   // merge control-flow from the latch block and the middle block. Update the
7853   // incoming values here and move the Phi into the preheader.
7854   SmallVector<PHINode *, 4> PhisInBlock;
7855   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7856     PhisInBlock.push_back(&Phi);
7857 
7858   for (PHINode *Phi : PhisInBlock) {
7859     Phi->replaceIncomingBlockWith(
7860         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7861         VecEpilogueIterationCountCheck);
7862     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7863     if (EPI.SCEVSafetyCheck)
7864       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7865     if (EPI.MemSafetyCheck)
7866       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7867     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7868   }
7869 
7870   // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
7872   Type *IdxTy = Legal->getWidestInductionType();
7873   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7874                                          LoopVectorPreHeader->getFirstNonPHI());
7875   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7876   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7877                            EPI.MainLoopIterationCountCheck);
7878 
7879   // Generate induction resume values. These variables save the new starting
7880   // indexes for the scalar loop. They are used to test if there are any tail
7881   // iterations left once the vector loop has completed.
7882   // Note that when the vectorized epilogue is skipped due to iteration count
7883   // check, then the resume value for the induction variable comes from
7884   // the trip count of the main vector loop, hence passing the AdditionalBypass
7885   // argument.
7886   createInductionResumeValues({VecEpilogueIterationCountCheck,
7887                                EPI.VectorTripCount} /* AdditionalBypass */);
7888 
7889   return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
7890 }
7891 
7892 BasicBlock *
7893 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7894     BasicBlock *Bypass, BasicBlock *Insert) {
7895 
7896   assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
7898   assert(
7899       (!isa<Instruction>(EPI.TripCount) ||
7900        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7901       "saved trip count does not dominate insertion point.");
7902   Value *TC = EPI.TripCount;
7903   IRBuilder<> Builder(Insert->getTerminator());
7904   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7905 
7906   // Generate code to check if the loop's trip count is less than VF * UF of the
7907   // vector epilogue loop.
7908   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7909       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7910 
7911   Value *CheckMinIters =
7912       Builder.CreateICmp(P, Count,
7913                          createStepForVF(Builder, Count->getType(),
7914                                          EPI.EpilogueVF, EPI.EpilogueUF),
7915                          "min.epilog.iters.check");
7916 
7917   ReplaceInstWithInst(
7918       Insert->getTerminator(),
7919       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7920 
7921   LoopBypassBlocks.push_back(Insert);
7922   return Insert;
7923 }
7924 
7925 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7926   LLVM_DEBUG({
7927     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7928            << "Epilogue Loop VF:" << EPI.EpilogueVF
7929            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7930   });
7931 }
7932 
7933 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7934   DEBUG_WITH_TYPE(VerboseDebug, {
7935     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7936   });
7937 }
7938 
7939 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7940     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7941   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7942   bool PredicateAtRangeStart = Predicate(Range.Start);
7943 
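  // For example, with Range = [4, 32) and a predicate that holds for VF=4 and
  // VF=8 but not for VF=16, Range.End is clamped to 16 and true is returned.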
7944   for (ElementCount TmpVF = Range.Start * 2;
7945        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7946     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7947       Range.End = TmpVF;
7948       break;
7949     }
7950 
7951   return PredicateAtRangeStart;
7952 }
7953 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
/// of VFs starting at a given VF and extending it as much as possible. Each
/// vectorization decision can potentially shorten this sub-range during
/// buildVPlan().
7959 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7960                                            ElementCount MaxVF) {
7961   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7962   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7963     VFRange SubRange = {VF, MaxVFPlusOne};
7964     VPlans.push_back(buildVPlan(SubRange));
7965     VF = SubRange.End;
7966   }
7967 }
7968 
7969 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7970                                          VPlanPtr &Plan) {
7971   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7972 
7973   // Look for cached value.
7974   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7975   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7976   if (ECEntryIt != EdgeMaskCache.end())
7977     return ECEntryIt->second;
7978 
7979   VPValue *SrcMask = createBlockInMask(Src, Plan);
7980 
7981   // The terminator has to be a branch inst!
7982   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7983   assert(BI && "Unexpected terminator found");
7984 
7985   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7986     return EdgeMaskCache[Edge] = SrcMask;
7987 
7988   // If source is an exiting block, we know the exit edge is dynamically dead
7989   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
7990   // adding uses of an otherwise potentially dead instruction.
7991   if (OrigLoop->isLoopExiting(Src))
7992     return EdgeMaskCache[Edge] = SrcMask;
7993 
7994   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7995   assert(EdgeMask && "No Edge Mask found for condition");
7996 
7997   if (BI->getSuccessor(0) != Dst)
7998     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7999 
8000   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8001     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8002     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison, whereas an 'and' would propagate the poison from
    // EdgeMask even when SrcMask is false.
8005     VPValue *False = Plan->getOrAddVPValue(
8006         ConstantInt::getFalse(BI->getCondition()->getType()));
8007     EdgeMask =
8008         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8009   }
8010 
8011   return EdgeMaskCache[Edge] = EdgeMask;
8012 }
8013 
8014 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8015   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8016 
8017   // Look for cached value.
8018   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8019   if (BCEntryIt != BlockMaskCache.end())
8020     return BCEntryIt->second;
8021 
8022   // All-one mask is modelled as no-mask following the convention for masked
8023   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8024   VPValue *BlockMask = nullptr;
8025 
8026   if (OrigLoop->getHeader() == BB) {
8027     if (!CM.blockNeedsPredicationForAnyReason(BB))
8028       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8029 
8030     assert(CM.foldTailByMasking() && "must fold the tail");
8031 
8032     // If we're using the active lane mask for control flow, then we get the
8033     // mask from the active lane mask PHI that is cached in the VPlan.
8034     PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
8035     if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
8036       return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
8037 
8038     // Introduce the early-exit compare IV <= BTC to form header block mask.
8039     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8040     // constructing the desired canonical IV in the header block as its first
8041     // non-phi instructions.
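    // For example, with an i8 IV and a trip count of 256, TC wraps to 0 while
    // BTC is 255, so the IV <= BTC mask remains correct on every iteration.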
8042 
8043     VPBasicBlock *HeaderVPBB =
8044         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8045     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8046     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8047     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8048 
8049     VPBuilder::InsertPointGuard Guard(Builder);
8050     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8051     if (EmitGetActiveLaneMask != PredicationStyle::None) {
8052       VPValue *TC = Plan->getOrCreateTripCount();
8053       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8054                                        nullptr, "active.lane.mask");
8055     } else {
8056       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8057       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8058     }
8059     return BlockMaskCache[BB] = BlockMask;
8060   }
8061 
8062   // This is the block mask. We OR all incoming edges.
8063   for (auto *Predecessor : predecessors(BB)) {
8064     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8065     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8066       return BlockMaskCache[BB] = EdgeMask;
8067 
    if (!BlockMask) { // BlockMask still has its initial nullptr value.
8069       BlockMask = EdgeMask;
8070       continue;
8071     }
8072 
8073     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8074   }
8075 
8076   return BlockMaskCache[BB] = BlockMask;
8077 }
8078 
8079 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8080                                                 ArrayRef<VPValue *> Operands,
8081                                                 VFRange &Range,
8082                                                 VPlanPtr &Plan) {
8083   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8084          "Must be called with either a load or store");
8085 
8086   auto willWiden = [&](ElementCount VF) -> bool {
8087     LoopVectorizationCostModel::InstWidening Decision =
8088         CM.getWideningDecision(I, VF);
8089     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8090            "CM decision should be taken at this point.");
8091     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8092       return true;
8093     if (CM.isScalarAfterVectorization(I, VF) ||
8094         CM.isProfitableToScalarize(I, VF))
8095       return false;
8096     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8097   };
8098 
8099   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8100     return nullptr;
8101 
8102   VPValue *Mask = nullptr;
8103   if (Legal->isMaskRequired(I))
8104     Mask = createBlockInMask(I->getParent(), Plan);
8105 
8106   // Determine if the pointer operand of the access is either consecutive or
8107   // reverse consecutive.
8108   LoopVectorizationCostModel::InstWidening Decision =
8109       CM.getWideningDecision(I, Range.Start);
8110   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8111   bool Consecutive =
8112       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8113 
8114   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8115     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8116                                               Consecutive, Reverse);
8117 
8118   StoreInst *Store = cast<StoreInst>(I);
8119   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8120                                             Mask, Consecutive, Reverse);
8121 }
8122 
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8124 /// insert a recipe to expand the step for the induction recipe.
8125 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8126     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8127     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8128     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8129   // Returns true if an instruction \p I should be scalarized instead of
8130   // vectorized for the chosen vectorization factor.
8131   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8132     return CM.isScalarAfterVectorization(I, VF) ||
8133            CM.isProfitableToScalarize(I, VF);
8134   };
8135 
8136   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8137       [&](ElementCount VF) {
8138         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8139       },
8140       Range);
8141   assert(IndDesc.getStartValue() ==
8142          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8143   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8144          "step must be loop invariant");
8145 
8146   VPValue *Step =
8147       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8148   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8149     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8150                                              !NeedsScalarIVOnly);
8151   }
8152   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8153   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8154                                            !NeedsScalarIVOnly);
8155 }
8156 
8157 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8158     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8159 
8160   // Check if this is an integer or fp induction. If so, build the recipe that
8161   // produces its scalar and vector values.
8162   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8163     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8164                                        *PSE.getSE(), *OrigLoop, Range);
8165 
8166   // Check if this is pointer induction. If so, build the recipe for it.
8167   if (auto *II = Legal->getPointerInductionDescriptor(Phi))
8168     return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
8169                                              *PSE.getSE());
8170   return nullptr;
8171 }
8172 
8173 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8174     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8175   // Optimize the special case where the source is a constant integer
8176   // induction variable. Notice that we can only optimize the 'trunc' case
8177   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8178   // (c) other casts depend on pointer size.
8179 
8180   // Determine whether \p K is a truncation based on an induction variable that
8181   // can be optimized.
8182   auto isOptimizableIVTruncate =
8183       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8184     return [=](ElementCount VF) -> bool {
8185       return CM.isOptimizableIVTruncate(K, VF);
8186     };
8187   };
8188 
8189   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8190           isOptimizableIVTruncate(I), Range)) {
8191 
8192     auto *Phi = cast<PHINode>(I->getOperand(0));
8193     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8194     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8195     return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8196                                        *PSE.getSE(), *OrigLoop, Range);
8197   }
8198   return nullptr;
8199 }
8200 
8201 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8202                                                 ArrayRef<VPValue *> Operands,
8203                                                 VPlanPtr &Plan) {
8204   // If all incoming values are equal, the incoming VPValue can be used directly
8205   // instead of creating a new VPBlendRecipe.
8206   VPValue *FirstIncoming = Operands[0];
8207   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8208         return FirstIncoming == Inc;
8209       })) {
8210     return Operands[0];
8211   }
8212 
8213   unsigned NumIncoming = Phi->getNumIncomingValues();
8214   // For in-loop reductions, we do not need to create an additional select.
8215   VPValue *InLoopVal = nullptr;
8216   for (unsigned In = 0; In < NumIncoming; In++) {
8217     PHINode *PhiOp =
8218         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8219     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8220       assert(!InLoopVal && "Found more than one in-loop reduction!");
8221       InLoopVal = Operands[In];
8222     }
8223   }
8224 
8225   assert((!InLoopVal || NumIncoming == 2) &&
8226          "Found an in-loop reduction for PHI with unexpected number of "
8227          "incoming values");
8228   if (InLoopVal)
8229     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8230 
8231   // We know that all PHIs in non-header blocks are converted into selects, so
8232   // we don't have to worry about the insertion order and we can just use the
8233   // builder. At this point we generate the predication tree. There may be
8234   // duplications since this is a simple recursive scan, but future
8235   // optimizations will clean it up.
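  // For example (illustrative), a phi with two predecessors becomes a
  // VPBlendRecipe with operands (V0, M0, V1, M1), which is later lowered to a
  // chain of selects over the incoming values.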
8236   SmallVector<VPValue *, 2> OperandsWithMask;
8237 
8238   for (unsigned In = 0; In < NumIncoming; In++) {
8239     VPValue *EdgeMask =
8240       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8241     assert((EdgeMask || NumIncoming == 1) &&
8242            "Multiple predecessors with one having a full mask");
8243     OperandsWithMask.push_back(Operands[In]);
8244     if (EdgeMask)
8245       OperandsWithMask.push_back(EdgeMask);
8246   }
8247   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8248 }
8249 
8250 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8251                                                    ArrayRef<VPValue *> Operands,
8252                                                    VFRange &Range) const {
8253 
8254   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8255       [this, CI](ElementCount VF) {
8256         return CM.isScalarWithPredication(CI, VF);
8257       },
8258       Range);
8259 
8260   if (IsPredicated)
8261     return nullptr;
8262 
8263   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8264   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8265              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8266              ID == Intrinsic::pseudoprobe ||
8267              ID == Intrinsic::experimental_noalias_scope_decl))
8268     return nullptr;
8269 
8270   auto willWiden = [&](ElementCount VF) -> bool {
8271     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction, i.e. whether it is beneficial to
    // use the intrinsic call rather than the library call.
8276     bool NeedToScalarize = false;
8277     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8278     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8279     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8280     return UseVectorIntrinsic || !NeedToScalarize;
8281   };
8282 
8283   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8284     return nullptr;
8285 
8286   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8287   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8288 }
8289 
8290 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8291   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8292          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8293   // Instruction should be widened, unless it is scalar after vectorization,
8294   // scalarization is profitable or it is predicated.
8295   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8296     return CM.isScalarAfterVectorization(I, VF) ||
8297            CM.isProfitableToScalarize(I, VF) ||
8298            CM.isScalarWithPredication(I, VF);
8299   };
8300   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8301                                                              Range);
8302 }
8303 
8304 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8305                                            ArrayRef<VPValue *> Operands) const {
8306   auto IsVectorizableOpcode = [](unsigned Opcode) {
8307     switch (Opcode) {
8308     case Instruction::Add:
8309     case Instruction::And:
8310     case Instruction::AShr:
8311     case Instruction::BitCast:
8312     case Instruction::FAdd:
8313     case Instruction::FCmp:
8314     case Instruction::FDiv:
8315     case Instruction::FMul:
8316     case Instruction::FNeg:
8317     case Instruction::FPExt:
8318     case Instruction::FPToSI:
8319     case Instruction::FPToUI:
8320     case Instruction::FPTrunc:
8321     case Instruction::FRem:
8322     case Instruction::FSub:
8323     case Instruction::ICmp:
8324     case Instruction::IntToPtr:
8325     case Instruction::LShr:
8326     case Instruction::Mul:
8327     case Instruction::Or:
8328     case Instruction::PtrToInt:
8329     case Instruction::SDiv:
8330     case Instruction::Select:
8331     case Instruction::SExt:
8332     case Instruction::Shl:
8333     case Instruction::SIToFP:
8334     case Instruction::SRem:
8335     case Instruction::Sub:
8336     case Instruction::Trunc:
8337     case Instruction::UDiv:
8338     case Instruction::UIToFP:
8339     case Instruction::URem:
8340     case Instruction::Xor:
8341     case Instruction::ZExt:
8342     case Instruction::Freeze:
8343       return true;
8344     }
8345     return false;
8346   };
8347 
8348   if (!IsVectorizableOpcode(I->getOpcode()))
8349     return nullptr;
8350 
8351   // Success: widen this instruction.
8352   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8353 }
8354 
8355 void VPRecipeBuilder::fixHeaderPhis() {
8356   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8357   for (VPHeaderPHIRecipe *R : PhisToFix) {
8358     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8359     VPRecipeBase *IncR =
8360         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8361     R->addOperand(IncR->getVPSingleValue());
8362   }
8363 }
8364 
8365 VPBasicBlock *VPRecipeBuilder::handleReplication(
8366     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8367     VPlanPtr &Plan) {
8368   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8369       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8370       Range);
8371 
8372   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8373       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8374       Range);
8375 
8376   // Even if the instruction is not marked as uniform, there are certain
8377   // intrinsic calls that can be effectively treated as such, so we check for
8378   // them here. Conservatively, we only do this for scalable vectors, since
8379   // for fixed-width VFs we can always fall back on full scalarization.
8380   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8381     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8382     case Intrinsic::assume:
8383     case Intrinsic::lifetime_start:
8384     case Intrinsic::lifetime_end:
8385       // For scalable vectors if one of the operands is variant then we still
8386       // want to mark as uniform, which will generate one instruction for just
8387       // the first lane of the vector. We can't scalarize the call in the same
8388       // way as for fixed-width vectors because we don't know how many lanes
8389       // there are.
8390       //
8391       // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic, generating the instruction for the
      //      first lane is still better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
8395       //   2. For the lifetime start/end intrinsics the pointer operand only
8396       //      does anything useful when the input comes from a stack object,
8397       //      which suggests it should always be uniform. For non-stack objects
8398       //      the effect is to poison the object, which still allows us to
8399       //      remove the call.
8400       IsUniform = true;
8401       break;
8402     default:
8403       break;
8404     }
8405   }
8406 
8407   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8408                                        IsUniform, IsPredicated);
8409   setRecipe(I, Recipe);
8410   Plan->addVPValue(I, Recipe);
8411 
8412   // Find if I uses a predicated instruction. If so, it will use its scalar
8413   // value. Avoid hoisting the insert-element which packs the scalar value into
8414   // a vector value, as that happens iff all users use the vector value.
8415   for (VPValue *Op : Recipe->operands()) {
8416     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8417     if (!PredR)
8418       continue;
8419     auto *RepR =
8420         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8421     assert(RepR->isPredicated() &&
8422            "expected Replicate recipe to be predicated");
8423     RepR->setAlsoPack(false);
8424   }
8425 
8426   // Finalize the recipe for Instr, first if it is not predicated.
8427   if (!IsPredicated) {
8428     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8429     VPBB->appendRecipe(Recipe);
8430     return VPBB;
8431   }
8432   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8433 
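  // I requires predication: wrap its recipe in a replicate region and splice
  // the region in between VPBB and VPBB's single successor, returning the new
  // empty block that follows the region.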
8434   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8435   assert(SingleSucc && "VPBB must have a single successor when handling "
8436                        "predicated replication.");
8437   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8438   // Record predicated instructions for above packing optimizations.
8439   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8440   VPBlockUtils::insertBlockAfter(Region, VPBB);
8441   auto *RegSucc = new VPBasicBlock();
8442   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8443   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8444   return RegSucc;
8445 }
8446 
8447 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
8448     Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) {
8449   // Instructions marked for predication are replicated and placed under an
8450   // if-then construct to prevent side-effects.
8451 
8452   // Generate recipes to compute the block mask for this region.
8453   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8454 
8455   // Build the triangular if-then region.
8456   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8457   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8458   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8459   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8460   auto *PHIRecipe = Instr->getType()->isVoidTy()
8461                         ? nullptr
8462                         : new VPPredInstPHIRecipe(PredRecipe);
8463   if (PHIRecipe) {
8464     Plan->removeVPValueFor(Instr);
8465     Plan->addVPValue(Instr, PHIRecipe);
8466   }
8467   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8468   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8469   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
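  // The region forms a triangle: Entry branches on BlockInMask either into the
  // ".if" block holding PredRecipe or straight to ".continue", where PHIRecipe
  // (if present) merges the value produced on the predicated path.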
8470 
8471   // Note: first set Entry as region entry and then connect successors starting
8472   // from it in order, to propagate the "parent" of each VPBasicBlock.
8473   VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8474   VPBlockUtils::connectBlocks(Pred, Exiting);
8475 
8476   return Region;
8477 }
8478 
8479 VPRecipeOrVPValueTy
8480 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8481                                         ArrayRef<VPValue *> Operands,
8482                                         VFRange &Range, VPlanPtr &Plan) {
8483   // First, check for specific widening recipes that deal with inductions, Phi
8484   // nodes, calls and memory operations.
8485   VPRecipeBase *Recipe;
8486   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8487     if (Phi->getParent() != OrigLoop->getHeader())
8488       return tryToBlend(Phi, Operands, Plan);
8489     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8490       return toVPRecipeResult(Recipe);
8491 
8492     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8493     assert((Legal->isReductionVariable(Phi) ||
8494             Legal->isFirstOrderRecurrence(Phi)) &&
8495            "can only widen reductions and first-order recurrences here");
8496     VPValue *StartV = Operands[0];
8497     if (Legal->isReductionVariable(Phi)) {
8498       const RecurrenceDescriptor &RdxDesc =
8499           Legal->getReductionVars().find(Phi)->second;
8500       assert(RdxDesc.getRecurrenceStartValue() ==
8501              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8502       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8503                                            CM.isInLoopReduction(Phi),
8504                                            CM.useOrderedReductions(RdxDesc));
8505     } else {
8506       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8507     }
8508 
8509     // Record the incoming value from the backedge, so we can add the incoming
8510     // value from the backedge after all recipes have been created.
8511     recordRecipeOf(cast<Instruction>(
8512         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8513     PhisToFix.push_back(PhiRecipe);
8514     return toVPRecipeResult(PhiRecipe);
8515   }
8516 
8517   if (isa<TruncInst>(Instr) &&
8518       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8519                                                Range, *Plan)))
8520     return toVPRecipeResult(Recipe);
8521 
8522   // All widen recipes below deal only with VF > 1.
8523   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8524           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8525     return nullptr;
8526 
8527   if (auto *CI = dyn_cast<CallInst>(Instr))
8528     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8529 
8530   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8531     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8532 
8533   if (!shouldWiden(Instr, Range))
8534     return nullptr;
8535 
8536   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8537     return toVPRecipeResult(new VPWidenGEPRecipe(
8538         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8539 
8540   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8541     bool InvariantCond =
8542         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8543     return toVPRecipeResult(new VPWidenSelectRecipe(
8544         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8545   }
8546 
8547   return toVPRecipeResult(tryToWiden(Instr, Operands));
8548 }
8549 
8550 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8551                                                         ElementCount MaxVF) {
8552   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8553 
8554   // Add assume instructions we need to drop to DeadInstructions, to prevent
8555   // them from being added to the VPlan.
8556   // TODO: We only need to drop assumes in blocks that get flattened. If the
8557   // control flow is preserved, we should keep them.
8558   SmallPtrSet<Instruction *, 4> DeadInstructions;
8559   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8560   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8561 
8562   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8563   // Dead instructions do not need sinking. Remove them from SinkAfter.
8564   for (Instruction *I : DeadInstructions)
8565     SinkAfter.erase(I);
8566 
8567   // Cannot sink instructions after dead instructions (there won't be any
8568   // recipes for them). Instead, find the first non-dead previous instruction.
8569   for (auto &P : Legal->getSinkAfter()) {
8570     Instruction *SinkTarget = P.second;
8571     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8572     (void)FirstInst;
8573     while (DeadInstructions.contains(SinkTarget)) {
8574       assert(
8575           SinkTarget != FirstInst &&
8576           "Must find a live instruction (at least the one feeding the "
8577           "first-order recurrence PHI) before reaching beginning of the block");
8578       SinkTarget = SinkTarget->getPrevNode();
8579       assert(SinkTarget != P.first &&
8580              "sink source equals target, no sinking required");
8581     }
8582     P.second = SinkTarget;
8583   }
8584 
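  // Build one VPlan per VF sub-range. Each buildVPlanWithVPRecipes call may
  // clamp SubRange.End so that all VFs in the sub-range share the same widening
  // decisions; the next sub-range then starts where this one ended.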
8585   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8586   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8587     VFRange SubRange = {VF, MaxVFPlusOne};
8588     VPlans.push_back(
8589         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8590     VF = SubRange.End;
8591   }
8592 }
8593 
8594 // Add the canonical IV and branch recipes required to control the vector
8595 // loop.
8596 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8597                                   bool HasNUW,
8598                                   bool UseLaneMaskForLoopControlFlow) {
8599   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8600   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8601 
8602   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8603   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8604   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8605   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8606   Header->insert(CanonicalIVPHI, Header->begin());
8607 
8608   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8609   // IV by VF * UF.
8610   auto *CanonicalIVIncrement =
8611       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8612                                : VPInstruction::CanonicalIVIncrement,
8613                         {CanonicalIVPHI}, DL, "index.next");
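  // The increment becomes the phi's second operand, i.e. its backedge value.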
8614   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8615 
8616   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8617   EB->appendRecipe(CanonicalIVIncrement);
8618 
8619   if (UseLaneMaskForLoopControlFlow) {
8620     // Create the active lane mask instruction in the vplan preheader.
8621     VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
8622 
8623     // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
8624     // we have to take unrolling into account. Each part needs to start at
8625     //   Part * VF
8626     auto *CanonicalIVIncrementParts =
8627         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8628                                  : VPInstruction::CanonicalIVIncrementForPart,
8629                           {StartV}, DL, "index.part.next");
8630     Preheader->appendRecipe(CanonicalIVIncrementParts);
8631 
8632     // Create the ActiveLaneMask instruction using the correct start values.
8633     VPValue *TC = Plan.getOrCreateTripCount();
8634     auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8635                                        {CanonicalIVIncrementParts, TC}, DL,
8636                                        "active.lane.mask.entry");
8637     Preheader->appendRecipe(EntryALM);
8638 
8639     // Now create the ActiveLaneMaskPhi recipe in the main loop using the
8640     // preheader ActiveLaneMask instruction.
8641     auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
8642     Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
8643 
8644     // Create the active lane mask for the next iteration of the loop.
8645     CanonicalIVIncrementParts =
8646         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8647                                  : VPInstruction::CanonicalIVIncrementForPart,
8648                           {CanonicalIVIncrement}, DL);
8649     EB->appendRecipe(CanonicalIVIncrementParts);
8650 
8651     auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8652                                   {CanonicalIVIncrementParts, TC}, DL,
8653                                   "active.lane.mask.next");
8654     EB->appendRecipe(ALM);
8655     LaneMaskPhi->addOperand(ALM);
8656 
8657     // We have to invert the mask here because a true condition means jumping
8658     // to the exit block.
8659     auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
8660     EB->appendRecipe(NotMask);
8661 
8662     VPInstruction *BranchBack =
8663         new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
8664     EB->appendRecipe(BranchBack);
8665   } else {
8666     // Add the BranchOnCount VPInstruction to the latch.
8667     VPInstruction *BranchBack = new VPInstruction(
8668         VPInstruction::BranchOnCount,
8669         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8670     EB->appendRecipe(BranchBack);
8671   }
8672 }
8673 
8674 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8675 // original exit block.
8676 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8677                                 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8678                                 VPlan &Plan) {
8679   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8680   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8681   // Only handle single-exit loops with unique exit blocks for now.
8682   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8683     return;
8684 
8685   // Introduce VPUsers modeling the exit values.
8686   for (PHINode &ExitPhi : ExitBB->phis()) {
8687     Value *IncomingValue =
8688         ExitPhi.getIncomingValueForBlock(ExitingBB);
8689     VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8690     Plan.addLiveOut(&ExitPhi, V);
8691   }
8692 }
8693 
8694 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8695     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8696     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8697 
8698   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8699 
8700   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8701 
8702   // ---------------------------------------------------------------------------
8703   // Pre-construction: record ingredients whose recipes we'll need to further
8704   // process after constructing the initial VPlan.
8705   // ---------------------------------------------------------------------------
8706 
8707   // Mark instructions we'll need to sink later and their targets as
8708   // ingredients whose recipe we'll need to record.
8709   for (auto &Entry : SinkAfter) {
8710     RecipeBuilder.recordRecipeOf(Entry.first);
8711     RecipeBuilder.recordRecipeOf(Entry.second);
8712   }
8713   for (auto &Reduction : CM.getInLoopReductionChains()) {
8714     PHINode *Phi = Reduction.first;
8715     RecurKind Kind =
8716         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8717     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8718 
8719     RecipeBuilder.recordRecipeOf(Phi);
8720     for (auto &R : ReductionOperations) {
8721       RecipeBuilder.recordRecipeOf(R);
8722       // For min/max reductions, where we have a pair of icmp/select, we also
8723       // need to record the ICmp recipe, so it can be removed later.
8724       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8725              "Only min/max recurrences allowed for inloop reductions");
8726       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8727         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8728     }
8729   }
8730 
8731   // For each interleave group which is relevant for this (possibly trimmed)
8732   // Range, add it to the set of groups to be later applied to the VPlan and add
8733   // placeholders for its members' Recipes which we'll be replacing with a
8734   // single VPInterleaveRecipe.
8735   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8736     auto applyIG = [IG, this](ElementCount VF) -> bool {
8737       return (VF.isVector() && // Query is illegal for VF == 1
8738               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8739                   LoopVectorizationCostModel::CM_Interleave);
8740     };
8741     if (!getDecisionAndClampRange(applyIG, Range))
8742       continue;
8743     InterleaveGroups.insert(IG);
8744     for (unsigned i = 0; i < IG->getFactor(); i++)
8745       if (Instruction *Member = IG->getMember(i))
8746         RecipeBuilder.recordRecipeOf(Member);
8747   }
8748 
8749   // ---------------------------------------------------------------------------
8750   // Build initial VPlan: Scan the body of the loop in a topological order to
8751   // visit each basic block after having visited its predecessor basic blocks.
8752   // ---------------------------------------------------------------------------
8753 
8754   // Create initial VPlan skeleton, starting with a block for the pre-header,
8755   // followed by a region for the vector loop, followed by the middle block. The
8756   // skeleton vector loop region contains a header and latch block.
8757   VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8758   auto Plan = std::make_unique<VPlan>(Preheader);
8759 
8760   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8761   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8762   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8763   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8764   VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8765   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8766   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8767 
8768   Instruction *DLInst =
8769       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8770   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8771                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8772                         !CM.foldTailByMasking(),
8773                         CM.useActiveLaneMaskForControlFlow());
8774 
8775   // Scan the body of the loop in a topological order to visit each basic block
8776   // after having visited its predecessor basic blocks.
8777   LoopBlocksDFS DFS(OrigLoop);
8778   DFS.perform(LI);
8779 
8780   VPBasicBlock *VPBB = HeaderVPBB;
8781   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8782   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8783     // Relevant instructions from basic block BB will be grouped into VPRecipe
8784     // ingredients and fill a new VPBasicBlock.
8785     unsigned VPBBsForBB = 0;
8786     if (VPBB != HeaderVPBB)
8787       VPBB->setName(BB->getName());
8788     Builder.setInsertPoint(VPBB);
8789 
8790     // Introduce each ingredient into VPlan.
8791     // TODO: Model and preserve debug intrinsics in VPlan.
8792     for (Instruction &I : BB->instructionsWithoutDebug()) {
8793       Instruction *Instr = &I;
8794 
8795       // First filter out irrelevant instructions, to ensure no recipes are
8796       // built for them.
8797       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8798         continue;
8799 
8800       SmallVector<VPValue *, 4> Operands;
8801       auto *Phi = dyn_cast<PHINode>(Instr);
8802       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8803         Operands.push_back(Plan->getOrAddVPValue(
8804             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8805       } else {
8806         auto OpRange = Plan->mapToVPValues(Instr->operands());
8807         Operands = {OpRange.begin(), OpRange.end()};
8808       }
8809 
8810       // Invariant stores inside the loop will be deleted and a single store
8811       // with the final reduction value will be added to the exit block.
8812       StoreInst *SI;
8813       if ((SI = dyn_cast<StoreInst>(&I)) &&
8814           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8815         continue;
8816 
8817       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8818               Instr, Operands, Range, Plan)) {
8819         // If Instr can be simplified to an existing VPValue, use it.
8820         if (RecipeOrValue.is<VPValue *>()) {
8821           auto *VPV = RecipeOrValue.get<VPValue *>();
8822           Plan->addVPValue(Instr, VPV);
8823           // If the re-used value is a recipe, register the recipe for the
8824           // instruction, in case the recipe for Instr needs to be recorded.
8825           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8826             RecipeBuilder.setRecipe(Instr, R);
8827           continue;
8828         }
8829         // Otherwise, add the new recipe.
8830         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8831         for (auto *Def : Recipe->definedValues()) {
8832           auto *UV = Def->getUnderlyingValue();
8833           Plan->addVPValue(UV, Def);
8834         }
8835 
8836         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8837             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8838           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8839           // of the header block. That can happen for truncates of induction
8840           // variables. Those recipes are moved to the phi section of the header
8841           // block after applying SinkAfter, which relies on the original
8842           // position of the trunc.
8843           assert(isa<TruncInst>(Instr));
8844           InductionsToMove.push_back(
8845               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8846         }
8847         RecipeBuilder.setRecipe(Instr, Recipe);
8848         VPBB->appendRecipe(Recipe);
8849         continue;
8850       }
8851 
8852       // Otherwise, if all widening options failed, Instr is to be
8853       // replicated. This may create a successor for VPBB.
8854       VPBasicBlock *NextVPBB =
8855           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8856       if (NextVPBB != VPBB) {
8857         VPBB = NextVPBB;
8858         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8859                                     : "");
8860       }
8861     }
8862 
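    // After processing BB, append a fresh empty VPBasicBlock; it will either
    // receive the next original block's recipes or be folded back into its
    // predecessor once all blocks have been visited.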
8863     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8864     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8865   }
8866 
8867   HeaderVPBB->setName("vector.body");
8868 
8869   // Fold the last, empty block into its predecessor.
8870   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8871   assert(VPBB && "expected to fold last (empty) block");
8872   // After here, VPBB should not be used.
8873   VPBB = nullptr;
8874 
8875   addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
8876 
8877   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8878          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8879          "entry block must be set to a VPRegionBlock having a non-empty entry "
8880          "VPBasicBlock");
8881   RecipeBuilder.fixHeaderPhis();
8882 
8883   // ---------------------------------------------------------------------------
8884   // Transform initial VPlan: Apply previously taken decisions, in order, to
8885   // bring the VPlan to its final state.
8886   // ---------------------------------------------------------------------------
8887 
8888   // Apply Sink-After legal constraints.
8889   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8890     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8891     if (Region && Region->isReplicator()) {
8892       assert(Region->getNumSuccessors() == 1 &&
8893              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8894       assert(R->getParent()->size() == 1 &&
8895              "A recipe in an original replicator region must be the only "
8896              "recipe in its block");
8897       return Region;
8898     }
8899     return nullptr;
8900   };
8901   for (auto &Entry : SinkAfter) {
8902     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8903     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8904 
8905     auto *TargetRegion = GetReplicateRegion(Target);
8906     auto *SinkRegion = GetReplicateRegion(Sink);
8907     if (!SinkRegion) {
8908       // If the sink source is not a replicate region, sink the recipe directly.
8909       if (TargetRegion) {
8910         // The target is in a replication region, make sure to move Sink to
8911         // the block after it, not into the replication region itself.
8912         VPBasicBlock *NextBlock =
8913             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
8914         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8915       } else
8916         Sink->moveAfter(Target);
8917       continue;
8918     }
8919 
8920     // The sink source is in a replicate region. Unhook the region from the CFG.
8921     auto *SinkPred = SinkRegion->getSinglePredecessor();
8922     auto *SinkSucc = SinkRegion->getSingleSuccessor();
8923     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
8924     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
8925     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
8926 
8927     if (TargetRegion) {
8928       // The target recipe is also in a replicate region, move the sink region
8929       // after the target region.
8930       auto *TargetSucc = TargetRegion->getSingleSuccessor();
8931       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
8932       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
8933       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
8934     } else {
8935       // The sink source is in a replicate region, we need to move the whole
8936       // replicate region, which should only contain a single recipe in the
8937       // main block.
8938       auto *SplitBlock =
8939           Target->getParent()->splitAt(std::next(Target->getIterator()));
8940 
8941       auto *SplitPred = SplitBlock->getSinglePredecessor();
8942 
8943       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
8944       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
8945       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
8946     }
8947   }
8948 
8949   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
8950   VPlanTransforms::removeRedundantInductionCasts(*Plan);
8951 
8952   // Now that sink-after is done, move induction recipes for optimized truncates
8953   // to the phi section of the header block.
8954   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
8955     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8956 
8957   // Adjust the recipes for any inloop reductions.
8958   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
8959                              RecipeBuilder, Range.Start);
8960 
8961   // Introduce a recipe to combine the incoming and previous values of a
8962   // first-order recurrence.
8963   for (VPRecipeBase &R :
8964        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8965     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
8966     if (!RecurPhi)
8967       continue;
8968 
8969     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
8970     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
8971     auto *Region = GetReplicateRegion(PrevRecipe);
8972     if (Region)
8973       InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
8974     if (!InsertBlock) {
8975       InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
8976       VPBlockUtils::insertBlockAfter(InsertBlock, Region);
8977     }
8978     if (Region || PrevRecipe->isPhi())
8979       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
8980     else
8981       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
8982 
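    // FirstOrderRecurrenceSplice(A, B) concatenates the last lane of A (the
    // value carried over from the previous iteration via the phi) with the
    // first VF-1 lanes of B (the value computed in the current iteration).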
8983     auto *RecurSplice = cast<VPInstruction>(
8984         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
8985                              {RecurPhi, RecurPhi->getBackedgeValue()}));
8986 
8987     RecurPhi->replaceAllUsesWith(RecurSplice);
8988     // Set the first operand of RecurSplice to RecurPhi again, after replacing
8989     // all users.
8990     RecurSplice->setOperand(0, RecurPhi);
8991   }
8992 
8993   // Interleave memory: for each Interleave Group we marked earlier as relevant
8994   // for this VPlan, replace the Recipes widening its memory instructions with a
8995   // single VPInterleaveRecipe at its insertion point.
8996   for (auto IG : InterleaveGroups) {
8997     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8998         RecipeBuilder.getRecipe(IG->getInsertPos()));
8999     SmallVector<VPValue *, 4> StoredValues;
9000     for (unsigned i = 0; i < IG->getFactor(); ++i)
9001       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9002         auto *StoreR =
9003             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9004         StoredValues.push_back(StoreR->getStoredValue());
9005       }
9006 
9007     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9008                                         Recipe->getMask());
9009     VPIG->insertBefore(Recipe);
9010     unsigned J = 0;
9011     for (unsigned i = 0; i < IG->getFactor(); ++i)
9012       if (Instruction *Member = IG->getMember(i)) {
9013         if (!Member->getType()->isVoidTy()) {
9014           VPValue *OriginalV = Plan->getVPValue(Member);
9015           Plan->removeVPValueFor(Member);
9016           Plan->addVPValue(Member, VPIG->getVPValue(J));
9017           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9018           J++;
9019         }
9020         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9021       }
9022   }
9023 
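  // Record all VFs covered by this plan and give the plan a descriptive name
  // for debug output.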
9024   std::string PlanName;
9025   raw_string_ostream RSO(PlanName);
9026   ElementCount VF = Range.Start;
9027   Plan->addVF(VF);
9028   RSO << "Initial VPlan for VF={" << VF;
9029   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9030     Plan->addVF(VF);
9031     RSO << "," << VF;
9032   }
9033   RSO << "},UF>=1";
9034   RSO.flush();
9035   Plan->setName(PlanName);
9036 
9037   // From this point onwards, VPlan-to-VPlan transformations may change the plan
9038   // in ways that accessing values using original IR values is incorrect.
9039   Plan->disableValue2VPValue();
9040 
9041   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9042   VPlanTransforms::sinkScalarOperands(*Plan);
9043   VPlanTransforms::removeDeadRecipes(*Plan);
9044   VPlanTransforms::mergeReplicateRegions(*Plan);
9045   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9046 
9047   // Fold Exit block into its predecessor if possible.
9048   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9049   // VPBasicBlock as exit.
9050   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
9051 
9052   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9053   return Plan;
9054 }
9055 
9056 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9057   // Outer loop handling: outer loops may require CFG and instruction-level
9058   // transformations before even evaluating whether vectorization is profitable.
9059   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9060   // the vectorization pipeline.
9061   assert(!OrigLoop->isInnermost());
9062   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9063 
9064   // Create new empty VPlan
9065   auto Plan = std::make_unique<VPlan>();
9066 
9067   // Build hierarchical CFG
9068   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9069   HCFGBuilder.buildHierarchicalCFG();
9070 
9071   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9072        VF *= 2)
9073     Plan->addVF(VF);
9074 
9075   SmallPtrSet<Instruction *, 1> DeadInstructions;
9076   VPlanTransforms::VPInstructionsToVPRecipes(
9077       OrigLoop, Plan,
9078       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9079       DeadInstructions, *PSE.getSE());
9080 
9081   // Remove the existing terminator of the exiting block of the top-most region.
9082   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9083   auto *Term =
9084       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9085   Term->eraseFromParent();
9086 
9087   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9088                         true, CM.useActiveLaneMaskForControlFlow());
9089   return Plan;
9090 }
9091 
9092 // Adjust the recipes for reductions. For in-loop reductions the chain of
9093 // instructions leading from the loop exit instr to the phi need to be converted
9094 // to reductions, with one operand being vector and the other being the scalar
9095 // reduction chain. For other reductions, a select is introduced between the phi
9096 // and live-out recipes when folding the tail.
9097 void LoopVectorizationPlanner::adjustRecipesForReductions(
9098     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9099     ElementCount MinVF) {
9100   for (auto &Reduction : CM.getInLoopReductionChains()) {
9101     PHINode *Phi = Reduction.first;
9102     const RecurrenceDescriptor &RdxDesc =
9103         Legal->getReductionVars().find(Phi)->second;
9104     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9105 
9106     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9107       continue;
9108 
9109     // ReductionOperations are ordered top-down from the phi's use to the
9110     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9111     // which of the two operands will remain scalar and which will be reduced.
9112     // For minmax the chain will be the select instructions.
9113     Instruction *Chain = Phi;
9114     for (Instruction *R : ReductionOperations) {
9115       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9116       RecurKind Kind = RdxDesc.getRecurrenceKind();
9117 
9118       VPValue *ChainOp = Plan->getVPValue(Chain);
9119       unsigned FirstOpId;
9120       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9121              "Only min/max recurrences allowed for inloop reductions");
9122       // Recognize a call to the llvm.fmuladd intrinsic.
9123       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9124       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9125              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9126       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9127         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9128                "Expected to replace a VPWidenSelectSC");
9129         FirstOpId = 1;
9130       } else {
9131         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9132                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9133                "Expected to replace a VPWidenSC");
9134         FirstOpId = 0;
9135       }
9136       unsigned VecOpId =
9137           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9138       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9139 
9140       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9141                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9142                          : nullptr;
9143 
9144       if (IsFMulAdd) {
9145         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9146         // need to create an fmul recipe to use as the vector operand for the
9147         // fadd reduction.
9148         VPInstruction *FMulRecipe = new VPInstruction(
9149             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9150         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9151         WidenRecipe->getParent()->insert(FMulRecipe,
9152                                          WidenRecipe->getIterator());
9153         VecOp = FMulRecipe;
9154       }
9155       VPReductionRecipe *RedRecipe =
9156           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9157       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9158       Plan->removeVPValueFor(R);
9159       Plan->addVPValue(R, RedRecipe);
9160       // Append the recipe to the end of the VPBasicBlock because we need to
9161       // ensure that it comes after all of its inputs, including CondOp.
9162       WidenRecipe->getParent()->appendRecipe(RedRecipe);
9164       WidenRecipe->eraseFromParent();
9165 
9166       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9167         VPRecipeBase *CompareRecipe =
9168             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9169         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9170                "Expected to replace a VPWidenSC");
9171         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9172                "Expected no remaining users");
9173         CompareRecipe->eraseFromParent();
9174       }
9175       Chain = R;
9176     }
9177   }
9178 
9179   // If tail is folded by masking, introduce selects between the phi
9180   // and the live-out instruction of each reduction, at the beginning of the
9181   // dedicated latch block.
9182   if (CM.foldTailByMasking()) {
9183     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9184     for (VPRecipeBase &R :
9185          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9186       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9187       if (!PhiR || PhiR->isInLoop())
9188         continue;
9189       VPValue *Cond =
9190           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9191       VPValue *Red = PhiR->getBackedgeValue();
9192       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9193              "reduction recipe must be defined before latch");
9194       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9195     }
9196   }
9197 }
9198 
9199 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9200 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9201                                VPSlotTracker &SlotTracker) const {
9202   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9203   IG->getInsertPos()->printAsOperand(O, false);
9204   O << ", ";
9205   getAddr()->printAsOperand(O, SlotTracker);
9206   VPValue *Mask = getMask();
9207   if (Mask) {
9208     O << ", ";
9209     Mask->printAsOperand(O, SlotTracker);
9210   }
9211 
9212   unsigned OpIdx = 0;
9213   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9214     if (!IG->getMember(i))
9215       continue;
9216     if (getNumStoreOperands() > 0) {
9217       O << "\n" << Indent << "  store ";
9218       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9219       O << " to index " << i;
9220     } else {
9221       O << "\n" << Indent << "  ";
9222       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9223       O << " = load from index " << i;
9224     }
9225     ++OpIdx;
9226   }
9227 }
9228 #endif
9229 
9230 void VPWidenCallRecipe::execute(VPTransformState &State) {
9231   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9232                                   *this, State);
9233 }
9234 
9235 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9236   assert(!State.Instance && "Int or FP induction being replicated.");
9237 
9238   Value *Start = getStartValue()->getLiveInIRValue();
9239   const InductionDescriptor &ID = getInductionDescriptor();
9240   TruncInst *Trunc = getTruncInst();
9241   IRBuilderBase &Builder = State.Builder;
9242   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9243   assert(State.VF.isVector() && "must have vector VF");
9244 
9245   // The value from the original loop to which we are mapping the new induction
9246   // variable.
9247   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9248 
9249   // Fast-math-flags propagate from the original induction instruction.
9250   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9251   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9252     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9253 
9254   // Now do the actual transformations, and start with fetching the step value.
9255   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9256 
9257   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9258          "Expected either an induction phi-node or a truncate of it!");
9259 
9260   // Construct the initial value of the vector IV in the vector loop preheader
9261   auto CurrIP = Builder.saveIP();
9262   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9263   Builder.SetInsertPoint(VectorPH->getTerminator());
9264   if (isa<TruncInst>(EntryVal)) {
9265     assert(Start->getType()->isIntegerTy() &&
9266            "Truncation requires an integer type");
9267     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9268     Step = Builder.CreateTrunc(Step, TruncType);
9269     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9270   }
9271 
9272   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9273   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
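  // SteppedStart holds the induction value for each of the first VF lanes,
  // i.e. Start advanced by <0, 1, ..., VF-1> * Step.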
9274   Value *SteppedStart = getStepVector(
9275       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9276 
9277   // We create vector phi nodes for both integer and floating-point induction
9278   // variables. Here, we determine the kind of arithmetic we will perform.
9279   Instruction::BinaryOps AddOp;
9280   Instruction::BinaryOps MulOp;
9281   if (Step->getType()->isIntegerTy()) {
9282     AddOp = Instruction::Add;
9283     MulOp = Instruction::Mul;
9284   } else {
9285     AddOp = ID.getInductionOpcode();
9286     MulOp = Instruction::FMul;
9287   }
9288 
9289   // Multiply the vectorization factor by the step using integer or
9290   // floating-point arithmetic as appropriate.
9291   Type *StepType = Step->getType();
9292   Value *RuntimeVF;
9293   if (Step->getType()->isFloatingPointTy())
9294     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9295   else
9296     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9297   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9298 
9299   // Create a vector splat to use in the induction update.
9300   //
9301   // FIXME: If the step is non-constant, we create the vector splat with
9302   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9303   //        handle a constant vector splat.
9304   Value *SplatVF = isa<Constant>(Mul)
9305                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9306                        : Builder.CreateVectorSplat(State.VF, Mul);
9307   Builder.restoreIP(CurrIP);
9308 
9309   // We may need to add the step a number of times, depending on the unroll
9310   // factor. The last of those goes into the PHI.
9311   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9312                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9313   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9314   Instruction *LastInduction = VecInd;
9315   for (unsigned Part = 0; Part < State.UF; ++Part) {
9316     State.set(this, LastInduction, Part);
9317 
9318     if (isa<TruncInst>(EntryVal))
9319       State.addMetadata(LastInduction, EntryVal);
9320 
9321     LastInduction = cast<Instruction>(
9322         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9323     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9324   }
9325 
9326   LastInduction->setName("vec.ind.next");
9327   VecInd->addIncoming(SteppedStart, VectorPH);
9328   // Add induction update using an incorrect block temporarily. The phi node
9329   // will be fixed after VPlan execution. Note that at this point the latch
9330   // block cannot be used, as it does not exist yet.
9331   // TODO: Model increment value in VPlan, by turning the recipe into a
9332   // multi-def and a subclass of VPHeaderPHIRecipe.
9333   VecInd->addIncoming(LastInduction, VectorPH);
9334 }
9335 
9336 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9337   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9338          "Not a pointer induction according to InductionDescriptor!");
9339   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9340          "Unexpected type.");
9341 
9342   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9343   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9344 
9345   if (onlyScalarsGenerated(State.VF)) {
9346     // This is the normalized GEP that starts counting at zero.
9347     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9348         CanonicalIV, IndDesc.getStep()->getType());
9349     // Determine the number of scalars we need to generate for each unroll
9350     // iteration. If the instruction is uniform, we only need to generate the
9351     // first lane. Otherwise, we generate all VF values.
9352     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9353     assert((IsUniform || !State.VF.isScalable()) &&
9354            "Cannot scalarize a scalable VF");
9355     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9356 
9357     for (unsigned Part = 0; Part < State.UF; ++Part) {
9358       Value *PartStart =
9359           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9360 
9361       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9362         Value *Idx = State.Builder.CreateAdd(
9363             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9364         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9365 
9366         Value *Step = CreateStepValue(IndDesc.getStep(), SE,
9367                                       State.CFG.PrevBB->getTerminator());
9368         Value *SclrGep = emitTransformedIndex(
9369             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9370         SclrGep->setName("next.gep");
9371         State.set(this, SclrGep, VPIteration(Part, Lane));
9372       }
9373     }
9374     return;
9375   }
9376 
9377   assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9378          "Induction step not a SCEV constant!");
9379   Type *PhiType = IndDesc.getStep()->getType();
9380 
9381   // Build a pointer phi
9382   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9383   Type *ScStValueType = ScalarStartValue->getType();
9384   PHINode *NewPointerPhi =
9385       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9386 
9387   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9388   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9389 
9390   // A pointer induction, performed by using a gep
9391   const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
9392   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9393 
9394   const SCEV *ScalarStep = IndDesc.getStep();
9395   SCEVExpander Exp(SE, DL, "induction");
9396   Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
9397   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9398   Value *NumUnrolledElems =
9399       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
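  // Advance the pointer phi once per vector iteration by VF * UF * Step
  // elements.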
9400   Value *InductionGEP = GetElementPtrInst::Create(
9401       IndDesc.getElementType(), NewPointerPhi,
9402       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9403       InductionLoc);
9404   // Add induction update using an incorrect block temporarily. The phi node
9405   // will be fixed after VPlan execution. Note that at this point the latch
9406   // block cannot be used, as it does not exist yet.
9407   // TODO: Model increment value in VPlan, by turning the recipe into a
9408   // multi-def and a subclass of VPHeaderPHIRecipe.
9409   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9410 
9411   // Create UF many actual address geps that use the pointer
9412   // phi as base and a vectorized version of the step value
9413   // (<step*0, ..., step*N>) as offset.
9414   for (unsigned Part = 0; Part < State.UF; ++Part) {
9415     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9416     Value *StartOffsetScalar =
9417         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9418     Value *StartOffset =
9419         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9420     // Create a vector of consecutive numbers from zero to VF.
9421     StartOffset = State.Builder.CreateAdd(
9422         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9423 
9424     Value *GEP = State.Builder.CreateGEP(
9425         IndDesc.getElementType(), NewPointerPhi,
9426         State.Builder.CreateMul(
9427             StartOffset,
9428             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9429             "vector.gep"));
9430     State.set(this, GEP, Part);
9431   }
9432 }
9433 
9434 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9435   assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9436 
9437   // Fast-math-flags propagate from the original induction instruction.
9438   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9439   if (IndDesc.getInductionBinOp() &&
9440       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9441     State.Builder.setFastMathFlags(
9442         IndDesc.getInductionBinOp()->getFastMathFlags());
9443 
9444   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
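  // Derive the scalar IV for lane 0 of part 0 from the canonical IV, converting
  // its type and applying the start/step transform when this recipe does not
  // directly model the canonical induction.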
9445   auto CreateScalarIV = [&](Value *&Step) -> Value * {
9446     Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9447     auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9448     if (!isCanonical() || CanonicalIV->getType() != Ty) {
9449       ScalarIV =
9450           Ty->isIntegerTy()
9451               ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
9452               : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
9453       ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
9454                                       getStartValue()->getLiveInIRValue(), Step,
9455                                       IndDesc);
9456       ScalarIV->setName("offset.idx");
9457     }
9458     if (TruncToTy) {
9459       assert(Step->getType()->isIntegerTy() &&
9460              "Truncation requires an integer step");
9461       ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
9462       Step = State.Builder.CreateTrunc(Step, TruncToTy);
9463     }
9464     return ScalarIV;
9465   };
9466 
9467   Value *ScalarIV = CreateScalarIV(Step);
9468   if (State.VF.isVector()) {
9469     buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
9470     return;
9471   }
9472 
9473   for (unsigned Part = 0; Part < State.UF; ++Part) {
9474     assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
9475     Value *EntryPart;
9476     if (Step->getType()->isFloatingPointTy()) {
9477       Value *StartIdx =
9478           getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
9479       // Floating-point operations inherit FMF via the builder's flags.
9480       Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
9481       EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
9482                                             ScalarIV, MulOp);
9483     } else {
9484       Value *StartIdx =
9485           getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
9486       EntryPart = State.Builder.CreateAdd(
9487           ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
9488     }
9489     State.set(this, EntryPart, Part);
9490   }
9491 }
9492 
9493 void VPInterleaveRecipe::execute(VPTransformState &State) {
9494   assert(!State.Instance && "Interleave group being replicated.");
9495   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9496                                       getStoredValues(), getMask());
9497 }
9498 
9499 void VPReductionRecipe::execute(VPTransformState &State) {
9500   assert(!State.Instance && "Reduction being replicated.");
9501   Value *PrevInChain = State.get(getChainOp(), 0);
9502   RecurKind Kind = RdxDesc->getRecurrenceKind();
9503   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9504   // Propagate the fast-math flags carried by the underlying instruction.
9505   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9506   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9507   for (unsigned Part = 0; Part < State.UF; ++Part) {
9508     Value *NewVecOp = State.get(getVecOp(), Part);
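    // For predicated reductions, replace masked-off lanes of the vector operand
    // with the reduction identity so they do not affect the result.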
9509     if (VPValue *Cond = getCondOp()) {
9510       Value *NewCond = State.get(Cond, Part);
9511       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9512       Value *Iden = RdxDesc->getRecurrenceIdentity(
9513           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9514       Value *IdenVec =
9515           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9516       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9517       NewVecOp = Select;
9518     }
9519     Value *NewRed;
9520     Value *NextInChain;
9521     if (IsOrdered) {
9522       if (State.VF.isVector())
9523         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9524                                         PrevInChain);
9525       else
9526         NewRed = State.Builder.CreateBinOp(
9527             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9528             NewVecOp);
9529       PrevInChain = NewRed;
9530     } else {
9531       PrevInChain = State.get(getChainOp(), Part);
9532       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9533     }
9534     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9535       NextInChain =
9536           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9537                          NewRed, PrevInChain);
9538     } else if (IsOrdered)
9539       NextInChain = NewRed;
9540     else
9541       NextInChain = State.Builder.CreateBinOp(
9542           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9543           PrevInChain);
9544     State.set(this, NextInChain, Part);
9545   }
9546 }
9547 
9548 void VPReplicateRecipe::execute(VPTransformState &State) {
9549   if (State.Instance) { // Generate a single instance.
9550     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9551     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9552                                     IsPredicated, State);
9553     // Insert the scalar instance, packing it into a vector.
9554     if (AlsoPack && State.VF.isVector()) {
9555       // If we're constructing lane 0, initialize to start from poison.
9556       if (State.Instance->Lane.isFirstLane()) {
9557         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9558         Value *Poison = PoisonValue::get(
9559             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9560         State.set(this, Poison, State.Instance->Part);
9561       }
9562       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9563     }
9564     return;
9565   }
9566 
9567   // Generate scalar instances for all VF lanes of all UF parts, unless the
9568   // instruction is uniform, in which case generate only the first lane for each
9569   // of the UF parts.
9570   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9571   assert((!State.VF.isScalable() || IsUniform) &&
9572          "Can't scalarize a scalable vector");
9573   for (unsigned Part = 0; Part < State.UF; ++Part)
9574     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9575       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9576                                       VPIteration(Part, Lane), IsPredicated,
9577                                       State);
9578 }
9579 
9580 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9581   assert(State.Instance && "Predicated instruction PHI works per instance.");
9582   Instruction *ScalarPredInst =
9583       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9584   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9585   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9586   assert(PredicatingBB && "Predicated block has no single predecessor.");
9587   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9588          "operand must be VPReplicateRecipe");
9589 
9590   // By current pack/unpack logic we need to generate only a single phi node: if
9591   // a vector value for the predicated instruction exists at this point it means
9592   // the instruction has vector users only, and a phi for the vector value is
9593   // needed. In this case the recipe of the predicated instruction is marked to
9594   // also do that packing, thereby "hoisting" the insert-element sequence.
9595   // Otherwise, a phi node for the scalar value is needed.
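  // For example (illustrative names): if the predicated block produced
  // %v1 = insertelement %v0, %s1, i32 1, the phi created below merges %v0
  // (incoming from the predicating block) with %v1 (incoming from the
  // predicated block).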
9596   unsigned Part = State.Instance->Part;
9597   if (State.hasVectorValue(getOperand(0), Part)) {
9598     Value *VectorValue = State.get(getOperand(0), Part);
9599     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9600     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9601     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9602     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9603     if (State.hasVectorValue(this, Part))
9604       State.reset(this, VPhi, Part);
9605     else
9606       State.set(this, VPhi, Part);
9607     // NOTE: Currently we need to update the value of the operand, so the next
9608     // predicated iteration inserts its generated value in the correct vector.
9609     State.reset(getOperand(0), VPhi, Part);
9610   } else {
9611     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9612     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9613     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9614                      PredicatingBB);
9615     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9616     if (State.hasScalarValue(this, *State.Instance))
9617       State.reset(this, Phi, *State.Instance);
9618     else
9619       State.set(this, Phi, *State.Instance);
9620     // NOTE: Currently we need to update the value of the operand, so the next
9621     // predicated iteration inserts its generated value in the correct vector.
9622     State.reset(getOperand(0), Phi, *State.Instance);
9623   }
9624 }
9625 
9626 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9627   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9628 
9629   // Attempt to issue a wide load.
9630   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9631   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9632 
9633   assert((LI || SI) && "Invalid Load/Store instruction");
9634   assert((!SI || StoredValue) && "No stored value provided for widened store");
9635   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9636 
9637   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9638 
9639   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9640   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9641   bool CreateGatherScatter = !Consecutive;
9642 
9643   auto &Builder = State.Builder;
9644   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9645   bool isMaskRequired = getMask();
9646   if (isMaskRequired)
9647     for (unsigned Part = 0; Part < State.UF; ++Part)
9648       BlockInMaskParts[Part] = State.get(getMask(), Part);
9649 
9650   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9651     // Calculate the pointer for the specific unroll-part.
9652     GetElementPtrInst *PartPtr = nullptr;
9653 
9654     bool InBounds = false;
9655     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9656       InBounds = gep->isInBounds();
9657     if (Reverse) {
9658       // If the address is consecutive but reversed, then the
9659       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue(); for fixed-width vectors
      // VScale is 1, so RunTimeVF = VF.getKnownMinValue().
9662       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9663       // NumElt = -Part * RunTimeVF
9664       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9665       // LastLane = 1 - RunTimeVF
9666       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
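      // For example (illustrative values): with a fixed VF of 4 and Part == 1,
      // NumElt == -4 and LastLane == -3, so the GEPs below address the
      // reversed group [Ptr-7, Ptr-4] as a single wide access.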
9667       PartPtr =
9668           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9669       PartPtr->setIsInBounds(InBounds);
9670       PartPtr = cast<GetElementPtrInst>(
9671           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9672       PartPtr->setIsInBounds(InBounds);
9673       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9674         BlockInMaskParts[Part] =
9675             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9676     } else {
9677       Value *Increment =
9678           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9679       PartPtr = cast<GetElementPtrInst>(
9680           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9681       PartPtr->setIsInBounds(InBounds);
9682     }
9683 
9684     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9685     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9686   };
9687 
9688   // Handle Stores:
9689   if (SI) {
9690     State.setDebugLocFromInst(SI);
9691 
9692     for (unsigned Part = 0; Part < State.UF; ++Part) {
9693       Instruction *NewSI = nullptr;
9694       Value *StoredVal = State.get(StoredValue, Part);
9695       if (CreateGatherScatter) {
9696         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9697         Value *VectorGep = State.get(getAddr(), Part);
9698         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9699                                             MaskPart);
9700       } else {
9701         if (Reverse) {
9702           // If we store to reverse consecutive memory locations, then we need
9703           // to reverse the order of elements in the stored value.
9704           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9705           // We don't want to update the value in the map as it might be used in
9706           // another expression. So don't call resetVectorValue(StoredVal).
9707         }
9708         auto *VecPtr =
9709             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9710         if (isMaskRequired)
9711           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9712                                             BlockInMaskParts[Part]);
9713         else
9714           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9715       }
9716       State.addMetadata(NewSI, SI);
9717     }
9718     return;
9719   }
9720 
9721   // Handle loads.
9722   assert(LI && "Must have a load instruction");
9723   State.setDebugLocFromInst(LI);
9724   for (unsigned Part = 0; Part < State.UF; ++Part) {
9725     Value *NewLI;
9726     if (CreateGatherScatter) {
9727       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9728       Value *VectorGep = State.get(getAddr(), Part);
9729       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9730                                          nullptr, "wide.masked.gather");
9731       State.addMetadata(NewLI, LI);
9732     } else {
9733       auto *VecPtr =
9734           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9735       if (isMaskRequired)
9736         NewLI = Builder.CreateMaskedLoad(
9737             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9738             PoisonValue::get(DataTy), "wide.masked.load");
9739       else
9740         NewLI =
9741             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9742 
      // Add metadata to the load, but set the vector value in State to the
      // reverse shuffle (created below when Reverse is set).
9744       State.addMetadata(NewLI, LI);
9745       if (Reverse)
9746         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9747     }
9748 
9749     State.set(getVPSingleValue(), NewLI, Part);
9750   }
9751 }
9752 
9753 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9754 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9755 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9756 // for predication.
9757 static ScalarEpilogueLowering getScalarEpilogueLowering(
9758     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9759     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9760     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9761     LoopVectorizationLegality &LVL) {
9762   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9763   // don't look at hints or options, and don't request a scalar epilogue.
9764   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9765   // LoopAccessInfo (due to code dependency and not being able to reliably get
9766   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9767   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9768   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9769   // back to the old way and vectorize with versioning when forced. See D81345.)
9770   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9771                                                       PGSOQueryType::IRPass) &&
9772                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9773     return CM_ScalarEpilogueNotAllowedOptSize;
9774 
9775   // 2) If set, obey the directives
9776   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9777     switch (PreferPredicateOverEpilogue) {
9778     case PreferPredicateTy::ScalarEpilogue:
9779       return CM_ScalarEpilogueAllowed;
9780     case PreferPredicateTy::PredicateElseScalarEpilogue:
9781       return CM_ScalarEpilogueNotNeededUsePredicate;
9782     case PreferPredicateTy::PredicateOrDontVectorize:
9783       return CM_ScalarEpilogueNotAllowedUsePredicate;
9784     };
9785   }
9786 
9787   // 3) If set, obey the hints
9788   switch (Hints.getPredicate()) {
9789   case LoopVectorizeHints::FK_Enabled:
9790     return CM_ScalarEpilogueNotNeededUsePredicate;
9791   case LoopVectorizeHints::FK_Disabled:
9792     return CM_ScalarEpilogueAllowed;
9793   };
9794 
  // 4) If the TTI hook indicates this is profitable, request predication.
9796   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9797                                        LVL.getLAI()))
9798     return CM_ScalarEpilogueNotNeededUsePredicate;
9799 
9800   return CM_ScalarEpilogueAllowed;
9801 }
9802 
9803 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def, return the one relevant for
  // \p Part.
9805   if (hasVectorValue(Def, Part))
9806     return Data.PerPartOutput[Def][Part];
9807 
9808   if (!hasScalarValue(Def, {Part, 0})) {
9809     Value *IRV = Def->getLiveInIRValue();
9810     Value *B = ILV->getBroadcastInstrs(IRV);
9811     set(Def, B, Part);
9812     return B;
9813   }
9814 
9815   Value *ScalarValue = get(Def, {Part, 0});
9816   // If we aren't vectorizing, we can just copy the scalar map values over
9817   // to the vector map.
9818   if (VF.isScalar()) {
9819     set(Def, ScalarValue, Part);
9820     return ScalarValue;
9821   }
9822 
9823   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9824   bool IsUniform = RepR && RepR->isUniform();
9825 
9826   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9827   // Check if there is a scalar value for the selected lane.
9828   if (!hasScalarValue(Def, {Part, LastLane})) {
9829     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
9830     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
9831             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
9832            "unexpected recipe found to be invariant");
9833     IsUniform = true;
9834     LastLane = 0;
9835   }
9836 
9837   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9838   // Set the insert point after the last scalarized instruction or after the
9839   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9840   // will directly follow the scalar definitions.
9841   auto OldIP = Builder.saveIP();
9842   auto NewIP =
9843       isa<PHINode>(LastInst)
9844           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9845           : std::next(BasicBlock::iterator(LastInst));
9846   Builder.SetInsertPoint(&*NewIP);
9847 
9848   // However, if we are vectorizing, we need to construct the vector values.
9849   // If the value is known to be uniform after vectorization, we can just
9850   // broadcast the scalar value corresponding to lane zero for each unroll
9851   // iteration. Otherwise, we construct the vector values using
9852   // insertelement instructions. Since the resulting vectors are stored in
9853   // State, we will only generate the insertelements once.
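  // For example (illustrative, fixed VF of 4): a uniform definition becomes a
  // single broadcast of its lane-0 scalar, while a non-uniform one is rebuilt
  // from its four scalar lanes with insertelement instructions starting from
  // poison.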
9854   Value *VectorValue = nullptr;
9855   if (IsUniform) {
9856     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9857     set(Def, VectorValue, Part);
9858   } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
9863     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9864       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9865     VectorValue = get(Def, Part);
9866   }
9867   Builder.restoreIP(OldIP);
9868   return VectorValue;
9869 }
9870 
9871 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
9873 // VPlan-to-VPlan transformations from the very beginning without modifying the
9874 // input LLVM IR.
9875 static bool processLoopInVPlanNativePath(
9876     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9877     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9878     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9879     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9880     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9881     LoopVectorizationRequirements &Requirements) {
9882 
9883   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9884     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9885     return false;
9886   }
9887   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9888   Function *F = L->getHeader()->getParent();
9889   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9890 
9891   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9892       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9893 
9894   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9895                                 &Hints, IAI);
9896   // Use the planner for outer loop vectorization.
9897   // TODO: CM is not used at this point inside the planner. Turn CM into an
9898   // optional argument if we don't need it in the future.
9899   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
9900 
9901   // Get user vectorization factor.
9902   ElementCount UserVF = Hints.getWidth();
9903 
9904   CM.collectElementTypesForWidening();
9905 
9906   // Plan how to best vectorize, return the best VF and its cost.
9907   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9908 
9909   // If we are stress testing VPlan builds, do not attempt to generate vector
9910   // code. Masked vector code generation support will follow soon.
9911   // Also, do not attempt to vectorize if no vector code will be produced.
9912   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9913     return false;
9914 
9915   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9916 
9917   {
9918     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9919                              F->getParent()->getDataLayout());
9920     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9921                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9922     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9923                       << L->getHeader()->getParent()->getName() << "\"\n");
9924     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9925   }
9926 
9927   // Mark the loop as already vectorized to avoid vectorizing again.
9928   Hints.setAlreadyVectorized();
9929   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9930   return true;
9931 }
9932 
9933 // Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated in double precision, there
9935 // will be a performance penalty from the conversion overhead and the change in
9936 // the vector width.
9937 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9938   SmallVector<Instruction *, 4> Worklist;
9939   for (BasicBlock *BB : L->getBlocks()) {
9940     for (Instruction &Inst : *BB) {
9941       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9942         if (S->getValueOperand()->getType()->isFloatTy())
9943           Worklist.push_back(S);
9944       }
9945     }
9946   }
9947 
  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
9950   SmallPtrSet<const Instruction *, 4> Visited;
9951   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9952   while (!Worklist.empty()) {
9953     auto *I = Worklist.pop_back_val();
9954     if (!L->contains(I))
9955       continue;
9956     if (!Visited.insert(I).second)
9957       continue;
9958 
9959     // Emit a remark if the floating point store required a floating
9960     // point conversion.
9961     // TODO: More work could be done to identify the root cause such as a
9962     // constant or a function return type and point the user to it.
9963     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9964       ORE->emit([&]() {
9965         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9966                                           I->getDebugLoc(), L->getHeader())
9967                << "floating point conversion changes vector width. "
9968                << "Mixed floating point precision requires an up/down "
9969                << "cast that will negatively impact performance.";
9970       });
9971 
9972     for (Use &Op : I->operands())
9973       if (auto *OpI = dyn_cast<Instruction>(Op))
9974         Worklist.push_back(OpI);
9975   }
9976 }
9977 
9978 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9979                                        VectorizationFactor &VF,
9980                                        Optional<unsigned> VScale, Loop *L,
9981                                        ScalarEvolution &SE) {
9982   InstructionCost CheckCost = Checks.getCost();
9983   if (!CheckCost.isValid())
9984     return false;
9985 
  // When interleaving only, the scalar and vector costs will be equal, which
  // in turn would lead to a divide by 0. Fall back to the hard threshold.
9988   if (VF.Width.isScalar()) {
9989     if (CheckCost > VectorizeMemoryCheckThreshold) {
9990       LLVM_DEBUG(
9991           dbgs()
9992           << "LV: Interleaving only is not profitable due to runtime checks\n");
9993       return false;
9994     }
9995     return true;
9996   }
9997 
  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
9999   double ScalarC = *VF.ScalarCost.getValue();
10000   if (ScalarC == 0)
10001     return true;
10002 
10003   // First, compute the minimum iteration count required so that the vector
10004   // loop outperforms the scalar loop.
10005   //  The total cost of the scalar loop is
10006   //   ScalarC * TC
10007   //  where
10008   //  * TC is the actual trip count of the loop.
10009   //  * ScalarC is the cost of a single scalar iteration.
10010   //
10011   //  The total cost of the vector loop is
10012   //    RtC + VecC * (TC / VF) + EpiC
10013   //  where
10014   //  * RtC is the cost of the generated runtime checks
10015   //  * VecC is the cost of a single vector iteration.
10016   //  * TC is the actual trip count of the loop
10017   //  * VF is the vectorization factor
  //  * EpiC is the cost of the generated epilogue, including the cost
10019   //    of the remaining scalar operations.
10020   //
10021   // Vectorization is profitable once the total vector cost is less than the
10022   // total scalar cost:
10023   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
10024   //
10025   // Now we can compute the minimum required trip count TC as
10026   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
10027   //
10028   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10029   // the computations are performed on doubles, not integers and the result
10030   // is rounded up, hence we get an upper estimate of the TC.
10031   unsigned IntVF = VF.Width.getKnownMinValue();
10032   if (VF.Width.isScalable()) {
10033     unsigned AssumedMinimumVscale = 1;
10034     if (VScale)
10035       AssumedMinimumVscale = *VScale;
10036     IntVF *= AssumedMinimumVscale;
10037   }
10038   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
10039   double RtC = *CheckCost.getValue();
10040   double MinTC1 = RtC / (ScalarC - VecCOverVF);
10041 
  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is
  // RtC + ScalarC * TC. To bound the runtime check to be a fraction 1/X
  // (currently X = 10) of the scalar cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
10049   double MinTC2 = RtC * 10 / ScalarC;
10050 
10051   // Now pick the larger minimum. If it is not a multiple of VF, choose the
10052   // next closest multiple of VF. This should partly compensate for ignoring
10053   // the epilogue cost.
10054   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10055   VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
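  // Illustrative example (assumed costs, not taken from any target): with
  // RtC = 30, ScalarC = 4, VecC = 8 and IntVF = 4, VecCOverVF = 2, so
  // MinTC1 = 30 / (4 - 2) = 15 and MinTC2 = 30 * 10 / 4 = 75; the larger bound
  // aligned up to a multiple of the VF gives MinProfitableTripCount = 76.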
10056 
10057   LLVM_DEBUG(
10058       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10059              << VF.MinProfitableTripCount << "\n");
10060 
10061   // Skip vectorization if the expected trip count is less than the minimum
10062   // required trip count.
10063   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
10064     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10065                                 VF.MinProfitableTripCount)) {
10066       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10067                            "trip count < minimum profitable VF ("
10068                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10069                         << ")\n");
10070 
10071       return false;
10072     }
10073   }
10074   return true;
10075 }
10076 
10077 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10078     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10079                                !EnableLoopInterleaving),
10080       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10081                               !EnableLoopVectorization) {}
10082 
10083 bool LoopVectorizePass::processLoop(Loop *L) {
10084   assert((EnableVPlanNativePath || L->isInnermost()) &&
10085          "VPlan-native path is not enabled. Only process inner loops.");
10086 
10087 #ifndef NDEBUG
10088   const std::string DebugLocStr = getDebugLocString(L);
10089 #endif /* NDEBUG */
10090 
10091   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10092                     << L->getHeader()->getParent()->getName() << "' from "
10093                     << DebugLocStr << "\n");
10094 
10095   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10096 
10097   LLVM_DEBUG(
10098       dbgs() << "LV: Loop hints:"
10099              << " force="
10100              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10101                      ? "disabled"
10102                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10103                             ? "enabled"
10104                             : "?"))
10105              << " width=" << Hints.getWidth()
10106              << " interleave=" << Hints.getInterleave() << "\n");
10107 
10108   // Function containing loop
10109   Function *F = L->getHeader()->getParent();
10110 
10111   // Looking at the diagnostic output is the only way to determine if a loop
10112   // was vectorized (other than looking at the IR or machine code), so it
10113   // is important to generate an optimization remark for each loop. Most of
10114   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10115   // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose and report vectorized loops and unvectorized loops that may
10117   // benefit from vectorization, respectively.
10118 
10119   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10120     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10121     return false;
10122   }
10123 
10124   PredicatedScalarEvolution PSE(*SE, *L);
10125 
10126   // Check if it is legal to vectorize the loop.
10127   LoopVectorizationRequirements Requirements;
10128   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10129                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10130   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10131     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10132     Hints.emitRemarkWithHints();
10133     return false;
10134   }
10135 
10136   // Check the function attributes and profiles to find out if this function
10137   // should be optimized for size.
10138   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10139       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10140 
10141   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10142   // here. They may require CFG and instruction level transformations before
10143   // even evaluating whether vectorization is profitable. Since we cannot modify
10144   // the incoming IR, we need to build VPlan upfront in the vectorization
10145   // pipeline.
10146   if (!L->isInnermost())
10147     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10148                                         ORE, BFI, PSI, Hints, Requirements);
10149 
10150   assert(L->isInnermost() && "Inner loop expected.");
10151 
10152   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10153   // count by optimizing for size, to minimize overheads.
10154   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10155   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10156     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10157                       << "This loop is worth vectorizing only if no scalar "
10158                       << "iteration overheads are incurred.");
10159     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10160       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10161     else {
10162       LLVM_DEBUG(dbgs() << "\n");
10163       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10164     }
10165   }
10166 
10167   // Check the function attributes to see if implicit floats are allowed.
10168   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10169   // an integer loop and the vector instructions selected are purely integer
10170   // vector instructions?
10171   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10172     reportVectorizationFailure(
10173         "Can't vectorize when the NoImplicitFloat attribute is used",
10174         "loop not vectorized due to NoImplicitFloat attribute",
10175         "NoImplicitFloat", ORE, L);
10176     Hints.emitRemarkWithHints();
10177     return false;
10178   }
10179 
10180   // Check if the target supports potentially unsafe FP vectorization.
10181   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10182   // for the target we're vectorizing for, to make sure none of the
10183   // additional fp-math flags can help.
10184   if (Hints.isPotentiallyUnsafe() &&
10185       TTI->isFPVectorizationPotentiallyUnsafe()) {
10186     reportVectorizationFailure(
10187         "Potentially unsafe FP op prevents vectorization",
10188         "loop not vectorized due to unsafe FP support.",
10189         "UnsafeFP", ORE, L);
10190     Hints.emitRemarkWithHints();
10191     return false;
10192   }
10193 
10194   bool AllowOrderedReductions;
10195   // If the flag is set, use that instead and override the TTI behaviour.
10196   if (ForceOrderedReductions.getNumOccurrences() > 0)
10197     AllowOrderedReductions = ForceOrderedReductions;
10198   else
10199     AllowOrderedReductions = TTI->enableOrderedReductions();
10200   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10201     ORE->emit([&]() {
10202       auto *ExactFPMathInst = Requirements.getExactFPInst();
10203       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10204                                                  ExactFPMathInst->getDebugLoc(),
10205                                                  ExactFPMathInst->getParent())
10206              << "loop not vectorized: cannot prove it is safe to reorder "
10207                 "floating-point operations";
10208     });
10209     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10210                          "reorder floating-point operations\n");
10211     Hints.emitRemarkWithHints();
10212     return false;
10213   }
10214 
10215   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10216   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10217 
10218   // If an override option has been passed in for interleaved accesses, use it.
10219   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10220     UseInterleaved = EnableInterleavedMemAccesses;
10221 
10222   // Analyze interleaved memory accesses.
10223   if (UseInterleaved) {
10224     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10225   }
10226 
10227   // Use the cost model.
10228   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10229                                 F, &Hints, IAI);
10230   CM.collectValuesToIgnore();
10231   CM.collectElementTypesForWidening();
10232 
10233   // Use the planner for vectorization.
10234   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
10235 
10236   // Get user vectorization factor and interleave count.
10237   ElementCount UserVF = Hints.getWidth();
10238   unsigned UserIC = Hints.getInterleave();
10239 
10240   // Plan how to best vectorize, return the best VF and its cost.
10241   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10242 
10243   VectorizationFactor VF = VectorizationFactor::Disabled();
10244   unsigned IC = 1;
10245 
10246   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10247                            F->getParent()->getDataLayout());
10248   if (MaybeVF) {
10249     VF = *MaybeVF;
10250     // Select the interleave count.
10251     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10252 
10253     unsigned SelectedIC = std::max(IC, UserIC);
10254     //  Optimistically generate runtime checks if they are needed. Drop them if
10255     //  they turn out to not be profitable.
10256     if (VF.Width.isVector() || SelectedIC > 1)
10257       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10258 
10259     // Check if it is profitable to vectorize with runtime checks.
10260     bool ForceVectorization =
10261         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10262     if (!ForceVectorization &&
10263         !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
10264                                     *PSE.getSE())) {
10265       ORE->emit([&]() {
10266         return OptimizationRemarkAnalysisAliasing(
10267                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10268                    L->getHeader())
10269                << "loop not vectorized: cannot prove it is safe to reorder "
10270                   "memory operations";
10271       });
10272       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10273       Hints.emitRemarkWithHints();
10274       return false;
10275     }
10276   }
10277 
10278   // Identify the diagnostic messages that should be produced.
10279   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10280   bool VectorizeLoop = true, InterleaveLoop = true;
10281   if (VF.Width.isScalar()) {
10282     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10283     VecDiagMsg = std::make_pair(
10284         "VectorizationNotBeneficial",
10285         "the cost-model indicates that vectorization is not beneficial");
10286     VectorizeLoop = false;
10287   }
10288 
10289   if (!MaybeVF && UserIC > 1) {
10290     // Tell the user interleaving was avoided up-front, despite being explicitly
10291     // requested.
10292     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10293                          "interleaving should be avoided up front\n");
10294     IntDiagMsg = std::make_pair(
10295         "InterleavingAvoided",
10296         "Ignoring UserIC, because interleaving was avoided up front");
10297     InterleaveLoop = false;
10298   } else if (IC == 1 && UserIC <= 1) {
10299     // Tell the user interleaving is not beneficial.
10300     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10301     IntDiagMsg = std::make_pair(
10302         "InterleavingNotBeneficial",
10303         "the cost-model indicates that interleaving is not beneficial");
10304     InterleaveLoop = false;
10305     if (UserIC == 1) {
10306       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10307       IntDiagMsg.second +=
10308           " and is explicitly disabled or interleave count is set to 1";
10309     }
10310   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
10312     LLVM_DEBUG(
10313         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10314     IntDiagMsg = std::make_pair(
10315         "InterleavingBeneficialButDisabled",
10316         "the cost-model indicates that interleaving is beneficial "
10317         "but is explicitly disabled or interleave count is set to 1");
10318     InterleaveLoop = false;
10319   }
10320 
10321   // Override IC if user provided an interleave count.
10322   IC = UserIC > 0 ? UserIC : IC;
10323 
10324   // Emit diagnostic messages, if any.
10325   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10326   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10328     ORE->emit([&]() {
10329       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10330                                       L->getStartLoc(), L->getHeader())
10331              << VecDiagMsg.second;
10332     });
10333     ORE->emit([&]() {
10334       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10335                                       L->getStartLoc(), L->getHeader())
10336              << IntDiagMsg.second;
10337     });
10338     return false;
10339   } else if (!VectorizeLoop && InterleaveLoop) {
10340     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10341     ORE->emit([&]() {
10342       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10343                                         L->getStartLoc(), L->getHeader())
10344              << VecDiagMsg.second;
10345     });
10346   } else if (VectorizeLoop && !InterleaveLoop) {
10347     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10348                       << ") in " << DebugLocStr << '\n');
10349     ORE->emit([&]() {
10350       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10351                                         L->getStartLoc(), L->getHeader())
10352              << IntDiagMsg.second;
10353     });
10354   } else if (VectorizeLoop && InterleaveLoop) {
10355     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10356                       << ") in " << DebugLocStr << '\n');
10357     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10358   }
10359 
10360   bool DisableRuntimeUnroll = false;
10361   MDNode *OrigLoopID = L->getLoopID();
10362   {
10363     using namespace ore;
10364     if (!VectorizeLoop) {
10365       assert(IC > 1 && "interleave count should not be 1 or 0");
10366       // If we decided that it is not legal to vectorize the loop, then
10367       // interleave it.
10368       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10369                                  &CM, BFI, PSI, Checks);
10370 
10371       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10372       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10373 
10374       ORE->emit([&]() {
10375         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10376                                   L->getHeader())
10377                << "interleaved loop (interleaved count: "
10378                << NV("InterleaveCount", IC) << ")";
10379       });
10380     } else {
10381       // If we decided that it is *legal* to vectorize the loop, then do it.
10382 
10383       // Consider vectorizing the epilogue too if it's profitable.
10384       VectorizationFactor EpilogueVF =
10385           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10386       if (EpilogueVF.Width.isVector()) {
10387 
10388         // The first pass vectorizes the main loop and creates a scalar epilogue
10389         // to be vectorized by executing the plan (potentially with a different
10390         // factor) again shortly afterwards.
10391         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10392         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10393                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10394 
10395         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10396         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10397                         DT, true);
10398         ++LoopsVectorized;
10399 
10400         // Second pass vectorizes the epilogue and adjusts the control flow
10401         // edges from the first pass.
10402         EPI.MainLoopVF = EPI.EpilogueVF;
10403         EPI.MainLoopUF = EPI.EpilogueUF;
10404         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10405                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10406                                                  Checks);
10407 
10408         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10409         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10410         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10411         Header->setName("vec.epilog.vector.body");
10412 
10413         // Ensure that the start values for any VPReductionPHIRecipes are
10414         // updated before vectorising the epilogue loop.
10415         for (VPRecipeBase &R : Header->phis()) {
10416           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10417             if (auto *Resume = MainILV.getReductionResumeValue(
10418                     ReductionPhi->getRecurrenceDescriptor())) {
10419               VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
10420               ReductionPhi->setOperand(0, StartVal);
10421             }
10422           }
10423         }
10424 
10425         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10426                         DT, true);
10427         ++LoopsEpilogueVectorized;
10428 
10429         if (!MainILV.areSafetyChecksAdded())
10430           DisableRuntimeUnroll = true;
10431       } else {
10432         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10433                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10434                                PSI, Checks);
10435 
10436         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10437         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10438         ++LoopsVectorized;
10439 
10440         // Add metadata to disable runtime unrolling a scalar loop when there
10441         // are no runtime checks about strides and memory. A scalar loop that is
10442         // rarely used is not worth unrolling.
10443         if (!LB.areSafetyChecksAdded())
10444           DisableRuntimeUnroll = true;
10445       }
10446       // Report the vectorization decision.
10447       ORE->emit([&]() {
10448         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10449                                   L->getHeader())
10450                << "vectorized loop (vectorization width: "
10451                << NV("VectorizationFactor", VF.Width)
10452                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10453       });
10454     }
10455 
10456     if (ORE->allowExtraAnalysis(LV_NAME))
10457       checkMixedPrecision(L, ORE);
10458   }
10459 
10460   Optional<MDNode *> RemainderLoopID =
10461       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10462                                       LLVMLoopVectorizeFollowupEpilogue});
10463   if (RemainderLoopID) {
10464     L->setLoopID(RemainderLoopID.value());
10465   } else {
10466     if (DisableRuntimeUnroll)
10467       AddRuntimeUnrollDisableMetaData(L);
10468 
10469     // Mark the loop as already vectorized to avoid vectorizing again.
10470     Hints.setAlreadyVectorized();
10471   }
10472 
10473   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10474   return true;
10475 }
10476 
10477 LoopVectorizeResult LoopVectorizePass::runImpl(
10478     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10479     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10480     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10481     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10482     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10483   SE = &SE_;
10484   LI = &LI_;
10485   TTI = &TTI_;
10486   DT = &DT_;
10487   BFI = &BFI_;
10488   TLI = TLI_;
10489   AA = &AA_;
10490   AC = &AC_;
10491   GetLAA = &GetLAA_;
10492   DB = &DB_;
10493   ORE = &ORE_;
10494   PSI = PSI_;
10495 
10496   // Don't attempt if
10497   // 1. the target claims to have no vector registers, and
10498   // 2. interleaving won't help ILP.
10499   //
10500   // The second condition is necessary because, even if the target has no
10501   // vector registers, loop vectorization may still enable scalar
10502   // interleaving.
10503   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10504       TTI->getMaxInterleaveFactor(1) < 2)
10505     return LoopVectorizeResult(false, false);
10506 
10507   bool Changed = false, CFGChanged = false;
10508 
10509   // The vectorizer requires loops to be in simplified form.
10510   // Since simplification may add new inner loops, it has to run before the
10511   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10513   // vectorized.
10514   for (auto &L : *LI)
10515     Changed |= CFGChanged |=
10516         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10517 
10518   // Build up a worklist of inner-loops to vectorize. This is necessary as
10519   // the act of vectorizing or partially unrolling a loop creates new loops
10520   // and can invalidate iterators across the loops.
10521   SmallVector<Loop *, 8> Worklist;
10522 
10523   for (Loop *L : *LI)
10524     collectSupportedLoops(*L, LI, ORE, Worklist);
10525 
10526   LoopsAnalyzed += Worklist.size();
10527 
10528   // Now walk the identified inner loops.
10529   while (!Worklist.empty()) {
10530     Loop *L = Worklist.pop_back_val();
10531 
10532     // For the inner loops we actually process, form LCSSA to simplify the
10533     // transform.
10534     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10535 
10536     Changed |= CFGChanged |= processLoop(L);
10537   }
10538 
10539   // Process each loop nest in the function.
10540   return LoopVectorizeResult(Changed, CFGChanged);
10541 }
10542 
10543 PreservedAnalyses LoopVectorizePass::run(Function &F,
10544                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
10596 }
10597 
10598 void LoopVectorizePass::printPipeline(
10599     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10600   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10601       OS, MapClassName2PassName);
10602 
10603   OS << "<";
10604   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10605   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10606   OS << ">";
10607 }
10608