1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
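// For example, conceptually (a simplified sketch that ignores the remainder
// loop and all legality and profitability checks), a scalar loop such as
//   for (i = 0; i < n; i++)    a[i] = b[i] + c[i];
// is rewritten for a vector width of 4 roughly as
//   for (i = 0; i < n; i += 4) a[i..i+3] = b[i..i+3] + c[i..i+3];
//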
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with a vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks with a "
204              "vectorize(enable) pragma."));
205 
206 // The prefer-predicate-over-epilogue option indicates that an epilogue is
207 // undesired and that predication is preferred; the enum below lists the
208 // available choices. I.e., the vectorizer will try to fold the tail loop
209 // (epilogue) into the vector body and predicate the instructions accordingly.
210 // If tail-folding fails, the fallback strategy depends on these values:
211 namespace PreferPredicateTy {
212   enum Option {
213     ScalarEpilogue = 0,
214     PredicateElseScalarEpilogue,
215     PredicateOrDontVectorize
216   };
217 } // namespace PreferPredicateTy
218 
219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220     "prefer-predicate-over-epilogue",
221     cl::init(PreferPredicateTy::ScalarEpilogue),
222     cl::Hidden,
223     cl::desc("Tail-folding and predication preferences over creating a scalar "
224              "epilogue loop."),
225     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226                          "scalar-epilogue",
227                          "Don't tail-predicate loops, create scalar epilogue"),
228               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229                          "predicate-else-scalar-epilogue",
230                          "prefer tail-folding, create scalar epilogue if tail "
231                          "folding fails."),
232               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233                          "predicate-dont-vectorize",
234                          "prefer tail-folding, don't attempt vectorization if "
235                          "tail-folding fails.")));
236 
237 static cl::opt<bool> MaximizeBandwidth(
238     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239     cl::desc("Maximize bandwidth when selecting the vectorization factor, "
240              "which will be determined by the smallest type in the loop."));
241 
242 static cl::opt<bool> EnableInterleavedMemAccesses(
243     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245 
246 /// An interleave-group may need masking if it resides in a block that needs
247 /// predication, or in order to mask away gaps.
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251 
252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254     cl::desc("We don't interleave loops with an estimated constant trip count "
255              "below this number"));
256 
257 static cl::opt<unsigned> ForceTargetNumScalarRegs(
258     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259     cl::desc("A flag that overrides the target's number of scalar registers."));
260 
261 static cl::opt<unsigned> ForceTargetNumVectorRegs(
262     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263     cl::desc("A flag that overrides the target's number of vector registers."));
264 
265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267     cl::desc("A flag that overrides the target's max interleave factor for "
268              "scalar loops."));
269 
270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272     cl::desc("A flag that overrides the target's max interleave factor for "
273              "vectorized loops."));
274 
275 static cl::opt<unsigned> ForceTargetInstructionCost(
276     "force-target-instruction-cost", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's expected cost for "
278              "an instruction to a single constant value. Mostly "
279              "useful for getting consistent testing."));
280 
281 static cl::opt<bool> ForceTargetSupportsScalableVectors(
282     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283     cl::desc(
284         "Pretend that scalable vectors are supported, even if the target does "
285         "not support them. This flag should only be used for testing."));
286 
287 static cl::opt<unsigned> SmallLoopCost(
288     "small-loop-cost", cl::init(20), cl::Hidden,
289     cl::desc(
290         "The cost of a loop that is considered 'small' by the interleaver."));
291 
292 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294     cl::desc("Enable the use of the block frequency analysis to access PGO "
295              "heuristics minimizing code growth in cold regions and being more "
296              "aggressive in hot regions."));
297 
298 // Runtime interleave loops for load/store throughput.
299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301     cl::desc(
302         "Enable runtime interleaving until load/store ports are saturated"));
303 
304 /// Interleave small loops with scalar reductions.
305 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307     cl::desc("Enable interleaving for loops with small iteration counts that "
308              "contain scalar reductions to expose ILP."));
309 
310 /// The number of stores in a loop that are allowed to need predication.
311 static cl::opt<unsigned> NumberOfStoresToPredicate(
312     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313     cl::desc("Max number of stores to be predicated behind an if."));
314 
315 static cl::opt<bool> EnableIndVarRegisterHeur(
316     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317     cl::desc("Count the induction variable only once when interleaving"));
318 
319 static cl::opt<bool> EnableCondStoresVectorization(
320     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
321     cl::desc("Enable if-predication of stores during vectorization."));
322 
323 static cl::opt<unsigned> MaxNestedScalarReductionIC(
324     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325     cl::desc("The maximum interleave count to use when interleaving a scalar "
326              "reduction in a nested loop."));
327 
328 static cl::opt<bool>
329     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330                            cl::Hidden,
331                            cl::desc("Prefer in-loop vector reductions, "
332                                     "overriding the target's preference."));
333 
334 static cl::opt<bool> ForceOrderedReductions(
335     "force-ordered-reductions", cl::init(false), cl::Hidden,
336     cl::desc("Enable the vectorization of loops with in-order (strict) "
337              "FP reductions"));
338 
339 static cl::opt<bool> PreferPredicatedReductionSelect(
340     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341     cl::desc(
342         "Prefer predicating a reduction operation over an after loop select."));
343 
344 cl::opt<bool> EnableVPlanNativePath(
345     "enable-vplan-native-path", cl::init(false), cl::Hidden,
346     cl::desc("Enable VPlan-native vectorization path with "
347              "support for outer loop vectorization."));
348 
349 // FIXME: Remove this switch once we have divergence analysis. Currently we
350 // assume divergent non-backedge branches when this switch is true.
351 cl::opt<bool> EnableVPlanPredication(
352     "enable-vplan-predication", cl::init(false), cl::Hidden,
353     cl::desc("Enable VPlan-native vectorization path predicator with "
354              "support for outer loop vectorization."));
355 
356 // This flag enables the stress testing of the VPlan H-CFG construction in the
357 // VPlan-native vectorization path. It must be used in conjunction with
358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359 // verification of the H-CFGs built.
360 static cl::opt<bool> VPlanBuildStressTest(
361     "vplan-build-stress-test", cl::init(false), cl::Hidden,
362     cl::desc(
363         "Build VPlan for every supported loop nest in the function and bail "
364         "out right after the build (stress test the VPlan H-CFG construction "
365         "in the VPlan-native vectorization path)."));
366 
367 cl::opt<bool> llvm::EnableLoopInterleaving(
368     "interleave-loops", cl::init(true), cl::Hidden,
369     cl::desc("Enable loop interleaving in loop vectorization passes"));
370 cl::opt<bool> llvm::EnableLoopVectorization(
371     "vectorize-loops", cl::init(true), cl::Hidden,
372     cl::desc("Run the loop vectorization passes"));
373 
374 cl::opt<bool> PrintVPlansInDotFormat(
375     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376     cl::desc("Use dot format instead of plain text when dumping VPlans"));
377 
378 /// A helper function that returns true if the given type is irregular. The
379 /// type is irregular if its allocated size doesn't equal the store size of an
380 /// element of the corresponding vector type.
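/// For illustration (exact values depend on the target's DataLayout): an i32
/// typically has an alloc size of 32 bits and a type size of 32 bits, so
/// hasIrregularType(i32, DL) is false, whereas x86_fp80 occupies 80 bits but
/// is commonly allocated in 96 or 128 bits, so hasIrregularType(x86_fp80, DL)
/// is true.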
381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382   // Determine if an array of N elements of type Ty is "bitcast compatible"
383   // with a <N x Ty> vector.
384   // This is only true if there is no padding between the array elements.
385   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386 }
387 
388 /// A helper function that returns the reciprocal of the block probability of
389 /// predicated blocks. If we return X, we are assuming the predicated block
390 /// will execute once for every X iterations of the loop header.
391 ///
392 /// TODO: We should use actual block probability here, if available. Currently,
393 ///       we always assume predicated blocks have a 50% chance of executing.
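/// For example, under that assumption the cost C of a predicated block is
/// scaled to roughly C / getReciprocalPredBlockProb() = C / 2 when it is
/// accumulated into the per-iteration cost of the loop.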
394 static unsigned getReciprocalPredBlockProb() { return 2; }
395 
396 /// A helper function that returns an integer or floating-point constant with
397 /// value C.
398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400                            : ConstantFP::get(Ty, C);
401 }
402 
403 /// Returns "best known" trip count for the specified loop \p L as defined by
404 /// the following procedure:
405 ///   1) Returns exact trip count if it is known.
406 ///   2) Returns expected trip count according to profile data if any.
407 ///   3) Returns upper bound estimate if it is known.
408 ///   4) Returns None if all of the above failed.
409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410   // Check if exact trip count is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412     return ExpectedTC;
413 
414   // Check if there is an expected trip count available from profile data.
415   if (LoopVectorizeWithBlockFrequency)
416     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417       return EstimatedTC;
418 
419   // Check if upper bound estimate is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421     return ExpectedTC;
422 
423   return None;
424 }
425 
426 // Forward declare GeneratedRTChecks.
427 class GeneratedRTChecks;
428 
429 namespace llvm {
430 
431 AnalysisKey ShouldRunExtraVectorPasses::Key;
432 
433 /// InnerLoopVectorizer vectorizes loops which contain only one basic
434 /// block to a specified vectorization factor (VF).
435 /// This class performs the widening of scalars into vectors, or multiple
436 /// scalars. This class also implements the following features:
437 /// * It inserts an epilogue loop for handling loops that don't have iteration
438 ///   counts that are known to be a multiple of the vectorization factor.
439 /// * It handles the code generation for reduction variables.
440 /// * Scalarization (implementation using scalars) of un-vectorizable
441 ///   instructions.
442 /// InnerLoopVectorizer does not perform any vectorization-legality
443 /// checks, and relies on the caller to check for the different legality
444 /// aspects. The InnerLoopVectorizer relies on the
445   /// LoopVectorizationLegality class to provide information about the
446   /// induction and reduction variables that were found in the loop.
447 class InnerLoopVectorizer {
448 public:
449   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
450                       LoopInfo *LI, DominatorTree *DT,
451                       const TargetLibraryInfo *TLI,
452                       const TargetTransformInfo *TTI, AssumptionCache *AC,
453                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
454                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
455                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
456                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
457       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
458         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
459         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
460         PSI(PSI), RTChecks(RTChecks) {
461     // Query this against the original loop and save it here because the profile
462     // of the original loop header may change as the transformation happens.
463     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
464         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465   }
466 
467   virtual ~InnerLoopVectorizer() = default;
468 
469   /// Create a new empty loop that will contain vectorized instructions later
470   /// on, while the old loop will be used as the scalar remainder. Control flow
471   /// is generated around the vectorized (and scalar epilogue) loops consisting
472   /// of various checks and bypasses. Return the pre-header block of the new
473   /// loop and the start value for the canonical induction, if it is != 0. The
474   /// latter is the case when vectorizing the epilogue loop. In the case of
475   /// epilogue vectorization, this function is overridden to handle the more
476   /// complex control flow around the loops.
477   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
478 
479   /// Widen a single call instruction within the innermost loop.
480   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
481                             VPTransformState &State);
482 
483   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
484   void fixVectorizedLoop(VPTransformState &State);
485 
486   // Return true if any runtime check is added.
487   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
488 
489   /// A type for vectorized values in the new loop. Each value from the
490   /// original loop, when vectorized, is represented by UF vector values in the
491   /// new unrolled loop, where UF is the unroll factor.
492   using VectorParts = SmallVector<Value *, 2>;
493 
494   /// Vectorize a single first-order recurrence or pointer induction PHINode in
495   /// a block. This method handles the induction variable canonicalization. It
496   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
497   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
498                            VPTransformState &State);
499 
500   /// A helper function to scalarize a single Instruction in the innermost loop.
501   /// Generates a sequence of scalar instances for each lane between \p MinLane
502   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
503   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
504   /// Instr's operands.
505   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
506                             const VPIteration &Instance, bool IfPredicateInstr,
507                             VPTransformState &State);
508 
509   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
510   /// is provided, the integer induction variable will first be truncated to
511   /// the corresponding type. \p CanonicalIV is the scalar value generated for
512   /// the canonical induction variable.
513   void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
514                              VPTransformState &State, Value *CanonicalIV);
515 
516   /// Construct the vector value of a scalarized value \p V one lane at a time.
517   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
518                                  VPTransformState &State);
519 
520   /// Try to vectorize interleaved access group \p Group with the base address
521   /// given in \p Addr, optionally masking the vector operations if \p
522   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
523   /// values in the vectorized loop.
524   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
525                                 ArrayRef<VPValue *> VPDefs,
526                                 VPTransformState &State, VPValue *Addr,
527                                 ArrayRef<VPValue *> StoredValues,
528                                 VPValue *BlockInMask = nullptr);
529 
530   /// Set the debug location in the builder \p CustomBuilder using the debug
531   /// location in \p V. If \p CustomBuilder is None, the member Builder is used.
532   void setDebugLocFromInst(const Value *V,
533                            Optional<IRBuilderBase *> CustomBuilder = None);
534 
535   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
536   void fixNonInductionPHIs(VPTransformState &State);
537 
538   /// Returns true if the reordering of FP operations is not allowed, but we are
539   /// able to vectorize with strict in-order reductions for the given RdxDesc.
540   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
541 
542   /// Create a broadcast instruction. This method generates a broadcast
543   /// instruction (shuffle) for loop invariant values and for the induction
544   /// value. If this is the induction variable, we extend it to N, N+1, ...;
545   /// this is needed because each iteration in the loop corresponds to a SIMD
546   /// element.
547   virtual Value *getBroadcastInstrs(Value *V);
548 
549   /// Add metadata from one instruction to another.
550   ///
551   /// This includes both the original MDs from \p From and additional ones (\see
552   /// addNewMetadata).  Use this for *newly created* instructions in the vector
553   /// loop.
554   void addMetadata(Instruction *To, Instruction *From);
555 
556   /// Similar to the previous function but it adds the metadata to a
557   /// vector of instructions.
558   void addMetadata(ArrayRef<Value *> To, Instruction *From);
559 
560   // Returns the resume value (bc.merge.rdx) for a reduction as
561   // generated by fixReduction.
562   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
563 
564 protected:
565   friend class LoopVectorizationPlanner;
566 
567   /// A small list of PHINodes.
568   using PhiVector = SmallVector<PHINode *, 4>;
569 
570   /// A type for scalarized values in the new loop. Each value from the
571   /// original loop, when scalarized, is represented by UF x VF scalar values
572   /// in the new unrolled loop, where UF is the unroll factor and VF is the
573   /// vectorization factor.
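  /// For example (an illustrative sketch), with UF = 2 and VF = 4 each
  /// original scalar value is represented by 2 groups of 4 scalar values,
  /// i.e. 8 values in total.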
574   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
575 
576   /// Set up the values of the IVs correctly when exiting the vector loop.
577   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
578                     Value *CountRoundDown, Value *EndValue,
579                     BasicBlock *MiddleBlock);
580 
581   /// Introduce a conditional branch (on true, condition to be set later) at the
582   /// end of the header (which is also the latch), connecting it to itself
583   /// (across the backedge) and to the exit block of \p L.
584   void createHeaderBranch(Loop *L);
585 
586   /// Handle all cross-iteration phis in the header.
587   void fixCrossIterationPHIs(VPTransformState &State);
588 
589   /// Create the exit value of first order recurrences in the middle block and
590   /// update their users.
591   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
592                                VPTransformState &State);
593 
594   /// Create code for the loop exit value of the reduction.
595   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
596 
597   /// Clear NSW/NUW flags from reduction instructions if necessary.
598   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
599                                VPTransformState &State);
600 
601   /// Fix up the LCSSA phi nodes in the unique exit block.  This simply
602   /// means we need to add the appropriate incoming value from the middle
603   /// block as exiting edges from the scalar epilogue loop (if present) are
604   /// already in place, and we exit the vector loop exclusively to the middle
605   /// block.
606   void fixLCSSAPHIs(VPTransformState &State);
607 
608   /// Iteratively sink the scalarized operands of a predicated instruction into
609   /// the block that was created for it.
610   void sinkScalarOperands(Instruction *PredInst);
611 
612   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
613   /// represented as.
614   void truncateToMinimalBitwidths(VPTransformState &State);
615 
616   /// Create a vector induction phi node based on an existing scalar one. \p
617   /// EntryVal is the value from the original loop that maps to the vector phi
618   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
619   /// truncate instruction, instead of widening the original IV, we widen a
620   /// version of the IV truncated to \p EntryVal's type.
621   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
622                                        Value *Step, Value *Start,
623                                        Instruction *EntryVal, VPValue *Def,
624                                        VPTransformState &State);
625 
626   /// Returns (and creates if needed) the original loop trip count.
627   Value *getOrCreateTripCount(Loop *NewLoop);
628 
629   /// Returns (and creates if needed) the trip count of the widened loop.
630   Value *getOrCreateVectorTripCount(Loop *NewLoop);
631 
632   /// Returns a bitcasted value to the requested vector type.
633   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
634   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
635                                 const DataLayout &DL);
636 
637   /// Emit a bypass check to see if the vector trip count is zero, including if
638   /// it overflows.
639   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
640 
641   /// Emit a bypass check to see if all of the SCEV assumptions we've
642   /// had to make are correct. Returns the block containing the checks or
643   /// nullptr if no checks have been added.
644   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
645 
646   /// Emit bypass checks to check any memory assumptions we may have made.
647   /// Returns the block containing the checks or nullptr if no checks have been
648   /// added.
649   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
650 
651   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
652   /// vector loop preheader, middle block and scalar preheader. Also
653   /// allocate a loop object for the new vector loop and return it.
654   Loop *createVectorLoopSkeleton(StringRef Prefix);
655 
656   /// Create new phi nodes for the induction variables to resume iteration count
657   /// in the scalar epilogue, from where the vectorized loop left off.
658   /// In cases where the loop skeleton is more complicated (e.g. epilogue
659   /// vectorization) and the resume values can come from an additional bypass
660   /// block, the \p AdditionalBypass pair provides information about the bypass
661   /// block and the end value on the edge from bypass to this loop.
662   void createInductionResumeValues(
663       Loop *L,
664       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
665 
666   /// Complete the loop skeleton by adding debug MDs, creating appropriate
667   /// conditional branches in the middle block, preparing the builder and
668   /// running the verifier. Take in the vector loop \p L as argument, and return
669   /// the preheader of the completed vector loop.
670   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
671 
672   /// Add additional metadata to \p To that was not present on \p Orig.
673   ///
674   /// Currently this is used to add the noalias annotations based on the
675   /// inserted memchecks.  Use this for instructions that are *cloned* into the
676   /// vector loop.
677   void addNewMetadata(Instruction *To, const Instruction *Orig);
678 
679   /// Collect poison-generating recipes that may generate a poison value that is
680   /// used after vectorization, even when their operands are not poison. Those
681   /// recipes meet the following conditions:
682   ///  * Contribute to the address computation of a recipe generating a widen
683   ///    memory load/store (VPWidenMemoryInstructionRecipe or
684   ///    VPInterleaveRecipe).
685   ///  * Such a widen memory load/store has at least one underlying Instruction
686   ///    that is in a basic block that needs predication and after vectorization
687   ///    the generated instruction won't be predicated.
688   void collectPoisonGeneratingRecipes(VPTransformState &State);
689 
690   /// Allow subclasses to override and print debug traces before/after vplan
691   /// execution, when trace information is requested.
692   virtual void printDebugTracesAtStart(){};
693   virtual void printDebugTracesAtEnd(){};
694 
695   /// The original loop.
696   Loop *OrigLoop;
697 
698   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
699   /// dynamic knowledge to simplify SCEV expressions and converts them to a
700   /// more usable form.
701   PredicatedScalarEvolution &PSE;
702 
703   /// Loop Info.
704   LoopInfo *LI;
705 
706   /// Dominator Tree.
707   DominatorTree *DT;
708 
709   /// Alias Analysis.
710   AAResults *AA;
711 
712   /// Target Library Info.
713   const TargetLibraryInfo *TLI;
714 
715   /// Target Transform Info.
716   const TargetTransformInfo *TTI;
717 
718   /// Assumption Cache.
719   AssumptionCache *AC;
720 
721   /// Interface to emit optimization remarks.
722   OptimizationRemarkEmitter *ORE;
723 
724   /// LoopVersioning.  It's only set up (non-null) if memchecks were
725   /// used.
726   ///
727   /// This is currently only used to add no-alias metadata based on the
728   /// memchecks.  The actual versioning is performed manually.
729   std::unique_ptr<LoopVersioning> LVer;
730 
731   /// The vectorization SIMD factor to use. Each vector will have this many
732   /// vector elements.
733   ElementCount VF;
734 
735   /// The vectorization unroll factor to use. Each scalar is vectorized to this
736   /// many different vector instructions.
737   unsigned UF;
738 
739   /// The builder that we use
740   IRBuilder<> Builder;
741 
742   // --- Vectorization state ---
743 
744   /// The vector-loop preheader.
745   BasicBlock *LoopVectorPreHeader;
746 
747   /// The scalar-loop preheader.
748   BasicBlock *LoopScalarPreHeader;
749 
750   /// Middle Block between the vector and the scalar.
751   BasicBlock *LoopMiddleBlock;
752 
753   /// The unique ExitBlock of the scalar loop if one exists.  Note that
754   /// there can be multiple exiting edges reaching this block.
755   BasicBlock *LoopExitBlock;
756 
757   /// The vector loop body.
758   BasicBlock *LoopVectorBody;
759 
760   /// The scalar loop body.
761   BasicBlock *LoopScalarBody;
762 
763   /// A list of all bypass blocks. The first block is the entry of the loop.
764   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
765 
766   /// Store instructions that were predicated.
767   SmallVector<Instruction *, 4> PredicatedInstructions;
768 
769   /// Trip count of the original loop.
770   Value *TripCount = nullptr;
771 
772   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
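  /// (For example, with TripCount = 19, VF = 4 and UF = 2 this is
  /// 19 - 19 % 8 = 16, leaving the remaining 3 iterations to the scalar
  /// epilogue.)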
773   Value *VectorTripCount = nullptr;
774 
775   /// The legality analysis.
776   LoopVectorizationLegality *Legal;
777 
778   /// The profitability analysis.
779   LoopVectorizationCostModel *Cost;
780 
781   // Record whether runtime checks are added.
782   bool AddedSafetyChecks = false;
783 
784   // Holds the end values for each induction variable. We save the end values
785   // so we can later fix-up the external users of the induction variables.
786   DenseMap<PHINode *, Value *> IVEndValues;
787 
788   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
789   // fixed up at the end of vector code generation.
790   SmallVector<PHINode *, 8> OrigPHIsToFix;
791 
792   /// BFI and PSI are used to check for profile guided size optimizations.
793   BlockFrequencyInfo *BFI;
794   ProfileSummaryInfo *PSI;
795 
796   // Whether this loop should be optimized for size based on profile guided size
797   // optimizations.
798   bool OptForSizeBasedOnProfile;
799 
800   /// Structure to hold information about generated runtime checks, responsible
801   /// for cleaning up the checks if vectorization turns out to be unprofitable.
802   GeneratedRTChecks &RTChecks;
803 
804   // Holds the resume values for reductions in the loops, used to set the
805   // correct start value of reduction PHIs when vectorizing the epilogue.
806   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
807       ReductionResumeValues;
808 };
809 
810 class InnerLoopUnroller : public InnerLoopVectorizer {
811 public:
812   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
813                     LoopInfo *LI, DominatorTree *DT,
814                     const TargetLibraryInfo *TLI,
815                     const TargetTransformInfo *TTI, AssumptionCache *AC,
816                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
817                     LoopVectorizationLegality *LVL,
818                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
819                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
820       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
821                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
822                             BFI, PSI, Check) {}
823 
824 private:
825   Value *getBroadcastInstrs(Value *V) override;
826 };
827 
828 /// Encapsulate information regarding vectorization of a loop and its epilogue.
829 /// This information is meant to be updated and used across two stages of
830 /// epilogue vectorization.
831 struct EpilogueLoopVectorizationInfo {
832   ElementCount MainLoopVF = ElementCount::getFixed(0);
833   unsigned MainLoopUF = 0;
834   ElementCount EpilogueVF = ElementCount::getFixed(0);
835   unsigned EpilogueUF = 0;
836   BasicBlock *MainLoopIterationCountCheck = nullptr;
837   BasicBlock *EpilogueIterationCountCheck = nullptr;
838   BasicBlock *SCEVSafetyCheck = nullptr;
839   BasicBlock *MemSafetyCheck = nullptr;
840   Value *TripCount = nullptr;
841   Value *VectorTripCount = nullptr;
842 
843   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
844                                 ElementCount EVF, unsigned EUF)
845       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
846     assert(EUF == 1 &&
847            "A high UF for the epilogue loop is likely not beneficial.");
848   }
849 };
850 
851 /// An extension of the inner loop vectorizer that creates a skeleton for a
852 /// vectorized loop that has its epilogue (residual) also vectorized.
853 /// The idea is to run the VPlan on a given loop twice: first to set up the
854 /// skeleton and vectorize the main loop, and second to complete the skeleton
855 /// from the first step and vectorize the epilogue.  This is achieved by
856 /// deriving two concrete strategy classes from this base class and invoking
857 /// them in succession from the loop vectorizer planner.
858 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
859 public:
860   InnerLoopAndEpilogueVectorizer(
861       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
862       DominatorTree *DT, const TargetLibraryInfo *TLI,
863       const TargetTransformInfo *TTI, AssumptionCache *AC,
864       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
865       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
866       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
867       GeneratedRTChecks &Checks)
868       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
869                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
870                             Checks),
871         EPI(EPI) {}
872 
873   // Override this function to handle the more complex control flow around the
874   // three loops.
875   std::pair<BasicBlock *, Value *>
876   createVectorizedLoopSkeleton() final override {
877     return createEpilogueVectorizedLoopSkeleton();
878   }
879 
880   /// The interface for creating a vectorized skeleton using one of two
881   /// different strategies, each corresponding to one execution of the vplan
882   /// as described above.
883   virtual std::pair<BasicBlock *, Value *>
884   createEpilogueVectorizedLoopSkeleton() = 0;
885 
886   /// Holds and updates state information required to vectorize the main loop
887   /// and its epilogue in two separate passes. This setup helps us avoid
888   /// regenerating and recomputing runtime safety checks. It also helps us to
889   /// shorten the iteration-count-check path length for the cases where the
890   /// iteration count of the loop is so small that the main vector loop is
891   /// completely skipped.
892   EpilogueLoopVectorizationInfo &EPI;
893 };
894 
895 /// A specialized derived class of inner loop vectorizer that performs
896 /// vectorization of *main* loops in the process of vectorizing loops and their
897 /// epilogues.
898 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
899 public:
900   EpilogueVectorizerMainLoop(
901       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
902       DominatorTree *DT, const TargetLibraryInfo *TLI,
903       const TargetTransformInfo *TTI, AssumptionCache *AC,
904       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
905       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
906       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
907       GeneratedRTChecks &Check)
908       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
909                                        EPI, LVL, CM, BFI, PSI, Check) {}
910   /// Implements the interface for creating a vectorized skeleton using the
911   /// *main loop* strategy (i.e. the first pass of VPlan execution).
912   std::pair<BasicBlock *, Value *>
913   createEpilogueVectorizedLoopSkeleton() final override;
914 
915 protected:
916   /// Emits an iteration count bypass check once for the main loop (when \p
917   /// ForEpilogue is false) and once for the epilogue loop (when \p
918   /// ForEpilogue is true).
919   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
920                                              bool ForEpilogue);
921   void printDebugTracesAtStart() override;
922   void printDebugTracesAtEnd() override;
923 };
924 
925 // A specialized derived class of inner loop vectorizer that performs
926 // vectorization of *epilogue* loops in the process of vectorizing loops and
927 // their epilogues.
928 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
929 public:
930   EpilogueVectorizerEpilogueLoop(
931       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
932       DominatorTree *DT, const TargetLibraryInfo *TLI,
933       const TargetTransformInfo *TTI, AssumptionCache *AC,
934       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
935       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
936       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
937       GeneratedRTChecks &Checks)
938       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
939                                        EPI, LVL, CM, BFI, PSI, Checks) {}
940   /// Implements the interface for creating a vectorized skeleton using the
941   /// *epilogue loop* strategy (i.e. the second pass of VPlan execution).
942   std::pair<BasicBlock *, Value *>
943   createEpilogueVectorizedLoopSkeleton() final override;
944 
945 protected:
946   /// Emits an iteration count bypass check after the main vector loop has
947   /// finished to see if there are any iterations left to execute by either
948   /// the vector epilogue or the scalar epilogue.
949   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
950                                                       BasicBlock *Bypass,
951                                                       BasicBlock *Insert);
952   void printDebugTracesAtStart() override;
953   void printDebugTracesAtEnd() override;
954 };
955 } // end namespace llvm
956 
957 /// Look for a meaningful debug location on the instruction or its
958 /// operands.
959 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
960   if (!I)
961     return I;
962 
963   DebugLoc Empty;
964   if (I->getDebugLoc() != Empty)
965     return I;
966 
967   for (Use &Op : I->operands()) {
968     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
969       if (OpInst->getDebugLoc() != Empty)
970         return OpInst;
971   }
972 
973   return I;
974 }
975 
976 void InnerLoopVectorizer::setDebugLocFromInst(
977     const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
978   IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
979   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
980     const DILocation *DIL = Inst->getDebugLoc();
981 
982     // When an FSDiscriminator is enabled, we don't need to add the multiply
983     // factors to the discriminators.
984     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
985         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
986       // FIXME: For scalable vectors, assume vscale=1.
987       auto NewDIL =
988           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
989       if (NewDIL)
990         B->SetCurrentDebugLocation(NewDIL.getValue());
991       else
992         LLVM_DEBUG(dbgs()
993                    << "Failed to create new discriminator: "
994                    << DIL->getFilename() << " Line: " << DIL->getLine());
995     } else
996       B->SetCurrentDebugLocation(DIL);
997   } else
998     B->SetCurrentDebugLocation(DebugLoc());
999 }
1000 
1001 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1002 /// is passed, the message relates to that particular instruction.
1003 #ifndef NDEBUG
1004 static void debugVectorizationMessage(const StringRef Prefix,
1005                                       const StringRef DebugMsg,
1006                                       Instruction *I) {
1007   dbgs() << "LV: " << Prefix << DebugMsg;
1008   if (I != nullptr)
1009     dbgs() << " " << *I;
1010   else
1011     dbgs() << '.';
1012   dbgs() << '\n';
1013 }
1014 #endif
1015 
1016 /// Create an analysis remark that explains why vectorization failed
1017 ///
1018 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1019 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1020 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1021 /// the location of the remark.  \return the remark object that can be
1022 /// streamed to.
1023 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1024     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1025   Value *CodeRegion = TheLoop->getHeader();
1026   DebugLoc DL = TheLoop->getStartLoc();
1027 
1028   if (I) {
1029     CodeRegion = I->getParent();
1030     // If there is no debug location attached to the instruction, fall back to
1031     // using the loop's.
1032     if (I->getDebugLoc())
1033       DL = I->getDebugLoc();
1034   }
1035 
1036   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1037 }
1038 
1039 namespace llvm {
1040 
1041 /// Return a value for Step multiplied by VF.
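/// For illustration (a sketch of the computation below): with \p Ty = i64,
/// \p Step = 2 and a fixed VF of 4 this returns the constant i64 8; with a
/// scalable VF of <vscale x 4> it returns 8 * vscale via CreateVScale.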
1042 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1043                        int64_t Step) {
1044   assert(Ty->isIntegerTy() && "Expected an integer step");
1045   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1046   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1047 }
1048 
1049 /// Return the runtime value for VF.
1050 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1051   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1052   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1053 }
1054 
1055 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1056                                   ElementCount VF) {
1057   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1058   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1059   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1060   return B.CreateUIToFP(RuntimeVF, FTy);
1061 }
1062 
1063 void reportVectorizationFailure(const StringRef DebugMsg,
1064                                 const StringRef OREMsg, const StringRef ORETag,
1065                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1066                                 Instruction *I) {
1067   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1068   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1069   ORE->emit(
1070       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1071       << "loop not vectorized: " << OREMsg);
1072 }
1073 
1074 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1075                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1076                              Instruction *I) {
1077   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1078   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1079   ORE->emit(
1080       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1081       << Msg);
1082 }
1083 
1084 } // end namespace llvm
1085 
1086 #ifndef NDEBUG
1087 /// \return string containing a file name and a line # for the given loop.
1088 static std::string getDebugLocString(const Loop *L) {
1089   std::string Result;
1090   if (L) {
1091     raw_string_ostream OS(Result);
1092     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1093       LoopDbgLoc.print(OS);
1094     else
1095       // Just print the module name.
1096       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1097     OS.flush();
1098   }
1099   return Result;
1100 }
1101 #endif
1102 
1103 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1104                                          const Instruction *Orig) {
1105   // If the loop was versioned with memchecks, add the corresponding no-alias
1106   // metadata.
1107   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1108     LVer->annotateInstWithNoAlias(To, Orig);
1109 }
1110 
1111 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1112     VPTransformState &State) {
1113 
1114   // Collect recipes in the backward slice of `Root` that may generate a poison
1115   // value that is used after vectorization.
1116   SmallPtrSet<VPRecipeBase *, 16> Visited;
1117   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1118     SmallVector<VPRecipeBase *, 16> Worklist;
1119     Worklist.push_back(Root);
1120 
1121     // Traverse the backward slice of Root through its use-def chain.
1122     while (!Worklist.empty()) {
1123       VPRecipeBase *CurRec = Worklist.back();
1124       Worklist.pop_back();
1125 
1126       if (!Visited.insert(CurRec).second)
1127         continue;
1128 
1129       // Prune search if we find another recipe generating a widen memory
1130       // instruction. Widen memory instructions involved in address computation
1131       // will lead to gather/scatter instructions, which don't need to be
1132       // handled.
1133       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1134           isa<VPInterleaveRecipe>(CurRec) ||
1135           isa<VPCanonicalIVPHIRecipe>(CurRec))
1136         continue;
1137 
1138       // This recipe contributes to the address computation of a widen
1139       // load/store. Collect recipe if its underlying instruction has
1140       // poison-generating flags.
1141       Instruction *Instr = CurRec->getUnderlyingInstr();
1142       if (Instr && Instr->hasPoisonGeneratingFlags())
1143         State.MayGeneratePoisonRecipes.insert(CurRec);
1144 
1145       // Add new definitions to the worklist.
1146       for (VPValue *operand : CurRec->operands())
1147         if (VPDef *OpDef = operand->getDef())
1148           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1149     }
1150   });
1151 
1152   // Traverse all the recipes in the VPlan and collect the poison-generating
1153   // recipes in the backward slice starting at the address of a
1154   // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1155   auto Iter = depth_first(
1156       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1157   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1158     for (VPRecipeBase &Recipe : *VPBB) {
1159       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1160         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1161         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1162         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1163             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1164           collectPoisonGeneratingInstrsInBackwardSlice(
1165               cast<VPRecipeBase>(AddrDef));
1166       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1167         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1168         if (AddrDef) {
1169           // Check if any member of the interleave group needs predication.
1170           const InterleaveGroup<Instruction> *InterGroup =
1171               InterleaveRec->getInterleaveGroup();
1172           bool NeedPredication = false;
1173           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1174                I < NumMembers; ++I) {
1175             Instruction *Member = InterGroup->getMember(I);
1176             if (Member)
1177               NeedPredication |=
1178                   Legal->blockNeedsPredication(Member->getParent());
1179           }
1180 
1181           if (NeedPredication)
1182             collectPoisonGeneratingInstrsInBackwardSlice(
1183                 cast<VPRecipeBase>(AddrDef));
1184         }
1185       }
1186     }
1187   }
1188 }
1189 
1190 void InnerLoopVectorizer::addMetadata(Instruction *To,
1191                                       Instruction *From) {
1192   propagateMetadata(To, From);
1193   addNewMetadata(To, From);
1194 }
1195 
1196 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1197                                       Instruction *From) {
1198   for (Value *V : To) {
1199     if (Instruction *I = dyn_cast<Instruction>(V))
1200       addMetadata(I, From);
1201   }
1202 }
1203 
1204 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1205     const RecurrenceDescriptor &RdxDesc) {
1206   auto It = ReductionResumeValues.find(&RdxDesc);
1207   assert(It != ReductionResumeValues.end() &&
1208          "Expected to find a resume value for the reduction.");
1209   return It->second;
1210 }
1211 
1212 namespace llvm {
1213 
// Loop vectorization cost-model hints about how the scalar epilogue loop
// should be lowered.
1216 enum ScalarEpilogueLowering {
1217 
1218   // The default: allowing scalar epilogues.
1219   CM_ScalarEpilogueAllowed,
1220 
1221   // Vectorization with OptForSize: don't allow epilogues.
1222   CM_ScalarEpilogueNotAllowedOptSize,
1223 
  // A special case of vectorization with OptForSize: loops with a very small
1225   // trip count are considered for vectorization under OptForSize, thereby
1226   // making sure the cost of their loop body is dominant, free of runtime
1227   // guards and scalar iteration overheads.
1228   CM_ScalarEpilogueNotAllowedLowTripLoop,
1229 
1230   // Loop hint predicate indicating an epilogue is undesired.
1231   CM_ScalarEpilogueNotNeededUsePredicate,
1232 
1233   // Directive indicating we must either tail fold or not vectorize
1234   CM_ScalarEpilogueNotAllowedUsePredicate
1235 };
1236 
1237 /// ElementCountComparator creates a total ordering for ElementCount
1238 /// for the purposes of using it in a set structure.
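/// For example, under this ordering all fixed element counts sort before all
/// scalable ones: ElementCount::getFixed(4) < ElementCount::getFixed(8) <
/// ElementCount::getScalable(2) < ElementCount::getScalable(4).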
1239 struct ElementCountComparator {
1240   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1241     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1242            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1243   }
1244 };
1245 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1246 
1247 /// LoopVectorizationCostModel - estimates the expected speedups due to
1248 /// vectorization.
1249 /// In many cases vectorization is not profitable. This can happen because of
1250 /// a number of reasons. In this class we mainly attempt to predict the
1251 /// expected speedup/slowdowns due to the supported instruction set. We use the
1252 /// TargetTransformInfo to query the different backends for the cost of
1253 /// different operations.
1254 class LoopVectorizationCostModel {
1255 public:
1256   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1257                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1258                              LoopVectorizationLegality *Legal,
1259                              const TargetTransformInfo &TTI,
1260                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1261                              AssumptionCache *AC,
1262                              OptimizationRemarkEmitter *ORE, const Function *F,
1263                              const LoopVectorizeHints *Hints,
1264                              InterleavedAccessInfo &IAI)
1265       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1266         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1267         Hints(Hints), InterleaveInfo(IAI) {}
1268 
1269   /// \return An upper bound for the vectorization factors (both fixed and
1270   /// scalable). If the factors are 0, vectorization and interleaving should be
1271   /// avoided up front.
1272   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1273 
1274   /// \return True if runtime checks are required for vectorization, and false
1275   /// otherwise.
1276   bool runtimeChecksRequired();
1277 
1278   /// \return The most profitable vectorization factor and the cost of that VF.
1279   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1280   /// then this vectorization factor will be selected if vectorization is
1281   /// possible.
1282   VectorizationFactor
1283   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1284 
1285   VectorizationFactor
1286   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1287                                     const LoopVectorizationPlanner &LVP);
1288 
1289   /// Setup cost-based decisions for user vectorization factor.
1290   /// \return true if the UserVF is a feasible VF to be chosen.
1291   bool selectUserVectorizationFactor(ElementCount UserVF) {
1292     collectUniformsAndScalars(UserVF);
1293     collectInstsToScalarize(UserVF);
1294     return expectedCost(UserVF).first.isValid();
1295   }
1296 
1297   /// \return The size (in bits) of the smallest and widest types in the code
1298   /// that needs to be vectorized. We ignore values that remain scalar such as
1299   /// 64 bit loop indices.
1300   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1301 
1302   /// \return The desired interleave count.
1303   /// If interleave count has been specified by metadata it will be returned.
1304   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1305   /// are the selected vectorization factor and the cost of the selected VF.
1306   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1307 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1315   void setCostBasedWideningDecision(ElementCount VF);
1316 
1317   /// A struct that represents some properties of the register usage
1318   /// of a loop.
1319   struct RegisterUsage {
1320     /// Holds the number of loop invariant values that are used in the loop.
1321     /// The key is ClassID of target-provided register class.
1322     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1323     /// Holds the maximum number of concurrent live intervals in the loop.
1324     /// The key is ClassID of target-provided register class.
1325     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1326   };
1327 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1330   SmallVector<RegisterUsage, 8>
1331   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1332 
1333   /// Collect values we want to ignore in the cost model.
1334   void collectValuesToIgnore();
1335 
1336   /// Collect all element types in the loop for which widening is needed.
1337   void collectElementTypesForWidening();
1338 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1341   void collectInLoopReductions();
1342 
  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the IsOrdered flag of RdxDesc is set and the
  /// hints do not allow reordering of FP operations.
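  /// For illustration: a loop accumulating a floating-point sum without
  /// reassociation permission must keep the original evaluation order, so the
  /// reduction is performed in-order rather than reassociated into a
  /// tree-wise vector reduction.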
1347   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1348     return !Hints->allowReordering() && RdxDesc.isOrdered();
1349   }
1350 
1351   /// \returns The smallest bitwidth each instruction can be represented with.
1352   /// The vector equivalents of these instructions should be truncated to this
1353   /// type.
1354   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1355     return MinBWs;
1356   }
1357 
1358   /// \returns True if it is more profitable to scalarize instruction \p I for
1359   /// vectorization factor \p VF.
1360   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1361     assert(VF.isVector() &&
1362            "Profitable to scalarize relevant only for VF > 1.");
1363 
1364     // Cost model is not run in the VPlan-native path - return conservative
1365     // result until this changes.
1366     if (EnableVPlanNativePath)
1367       return false;
1368 
1369     auto Scalars = InstsToScalarize.find(VF);
1370     assert(Scalars != InstsToScalarize.end() &&
1371            "VF not yet analyzed for scalarization profitability");
1372     return Scalars->second.find(I) != Scalars->second.end();
1373   }
1374 
1375   /// Returns true if \p I is known to be uniform after vectorization.
1376   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1377     if (VF.isScalar())
1378       return true;
1379 
1380     // Cost model is not run in the VPlan-native path - return conservative
1381     // result until this changes.
1382     if (EnableVPlanNativePath)
1383       return false;
1384 
1385     auto UniformsPerVF = Uniforms.find(VF);
1386     assert(UniformsPerVF != Uniforms.end() &&
1387            "VF not yet analyzed for uniformity");
1388     return UniformsPerVF->second.count(I);
1389   }
1390 
1391   /// Returns true if \p I is known to be scalar after vectorization.
1392   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1393     if (VF.isScalar())
1394       return true;
1395 
1396     // Cost model is not run in the VPlan-native path - return conservative
1397     // result until this changes.
1398     if (EnableVPlanNativePath)
1399       return false;
1400 
1401     auto ScalarsPerVF = Scalars.find(VF);
1402     assert(ScalarsPerVF != Scalars.end() &&
1403            "Scalar values are not calculated for VF");
1404     return ScalarsPerVF->second.count(I);
1405   }
1406 
1407   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1408   /// for vectorization factor \p VF.
1409   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1410     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1411            !isProfitableToScalarize(I, VF) &&
1412            !isScalarAfterVectorization(I, VF);
1413   }
1414 
1415   /// Decision that was taken during cost calculation for memory instruction.
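  /// For example (illustrative): a unit-stride load from A[i] typically maps
  /// to CM_Widen, a load from A[N - i] to CM_Widen_Reverse, members of an
  /// interleave group to CM_Interleave, and an indexed access such as A[B[i]]
  /// to CM_GatherScatter or CM_Scalarize, depending on target support and
  /// cost.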
1416   enum InstWidening {
1417     CM_Unknown,
1418     CM_Widen,         // For consecutive accesses with stride +1.
1419     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1420     CM_Interleave,
1421     CM_GatherScatter,
1422     CM_Scalarize
1423   };
1424 
1425   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1426   /// instruction \p I and vector width \p VF.
1427   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1428                            InstructionCost Cost) {
1429     assert(VF.isVector() && "Expected VF >=2");
1430     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1431   }
1432 
1433   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1434   /// interleaving group \p Grp and vector width \p VF.
1435   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1436                            ElementCount VF, InstWidening W,
1437                            InstructionCost Cost) {
1438     assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// But the cost will be assigned to one instruction only.
1441     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1442       if (auto *I = Grp->getMember(i)) {
1443         if (Grp->getInsertPos() == I)
1444           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1445         else
1446           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1447       }
1448     }
1449   }
1450 
1451   /// Return the cost model decision for the given instruction \p I and vector
1452   /// width \p VF. Return CM_Unknown if this instruction did not pass
1453   /// through the cost modeling.
1454   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1455     assert(VF.isVector() && "Expected VF to be a vector VF");
1456     // Cost model is not run in the VPlan-native path - return conservative
1457     // result until this changes.
1458     if (EnableVPlanNativePath)
1459       return CM_GatherScatter;
1460 
1461     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1462     auto Itr = WideningDecisions.find(InstOnVF);
1463     if (Itr == WideningDecisions.end())
1464       return CM_Unknown;
1465     return Itr->second.first;
1466   }
1467 
1468   /// Return the vectorization cost for the given instruction \p I and vector
1469   /// width \p VF.
1470   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1471     assert(VF.isVector() && "Expected VF >=2");
1472     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1473     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1474            "The cost is not calculated");
1475     return WideningDecisions[InstOnVF].second;
1476   }
1477 
1478   /// Return True if instruction \p I is an optimizable truncate whose operand
1479   /// is an induction variable. Such a truncate will be removed by adding a new
1480   /// induction variable with the destination type.
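  /// For example (illustrative IR):
  ///   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  ///   %t  = trunc i64 %iv to i32
  /// Here the truncate can be optimized away by introducing a new i32
  /// induction variable, provided %iv is an induction PHI and the truncate is
  /// not already free for the target.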
1481   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1482     // If the instruction is not a truncate, return false.
1483     auto *Trunc = dyn_cast<TruncInst>(I);
1484     if (!Trunc)
1485       return false;
1486 
1487     // Get the source and destination types of the truncate.
1488     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1489     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1490 
1491     // If the truncate is free for the given types, return false. Replacing a
1492     // free truncate with an induction variable would add an induction variable
1493     // update instruction to each iteration of the loop. We exclude from this
1494     // check the primary induction variable since it will need an update
1495     // instruction regardless.
1496     Value *Op = Trunc->getOperand(0);
1497     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1498       return false;
1499 
1500     // If the truncated value is not an induction variable, return false.
1501     return Legal->isInductionPhi(Op);
1502   }
1503 
1504   /// Collects the instructions to scalarize for each predicated instruction in
1505   /// the loop.
1506   void collectInstsToScalarize(ElementCount VF);
1507 
1508   /// Collect Uniform and Scalar values for the given \p VF.
1509   /// The sets depend on CM decision for Load/Store instructions
1510   /// that may be vectorized as interleave, gather-scatter or scalarized.
1511   void collectUniformsAndScalars(ElementCount VF) {
1512     // Do the analysis once.
1513     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1514       return;
1515     setCostBasedWideningDecision(VF);
1516     collectLoopUniforms(VF);
1517     collectLoopScalars(VF);
1518   }
1519 
1520   /// Returns true if the target machine supports masked store operation
1521   /// for the given \p DataType and kind of access to \p Ptr.
1522   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1523     return Legal->isConsecutivePtr(DataType, Ptr) &&
1524            TTI.isLegalMaskedStore(DataType, Alignment);
1525   }
1526 
1527   /// Returns true if the target machine supports masked load operation
1528   /// for the given \p DataType and kind of access to \p Ptr.
1529   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1530     return Legal->isConsecutivePtr(DataType, Ptr) &&
1531            TTI.isLegalMaskedLoad(DataType, Alignment);
1532   }
1533 
1534   /// Returns true if the target machine can represent \p V as a masked gather
1535   /// or scatter operation.
1536   bool isLegalGatherOrScatter(Value *V,
1537                               ElementCount VF = ElementCount::getFixed(1)) {
1538     bool LI = isa<LoadInst>(V);
1539     bool SI = isa<StoreInst>(V);
1540     if (!LI && !SI)
1541       return false;
1542     auto *Ty = getLoadStoreType(V);
1543     Align Align = getLoadStoreAlignment(V);
1544     if (VF.isVector())
1545       Ty = VectorType::get(Ty, VF);
1546     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1547            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1548   }
1549 
1550   /// Returns true if the target machine supports all of the reduction
1551   /// variables found for the given VF.
1552   bool canVectorizeReductions(ElementCount VF) const {
1553     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1554       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1555       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1556     }));
1557   }
1558 
1559   /// Returns true if \p I is an instruction that will be scalarized with
1560   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1561   /// instructions include conditional stores and instructions that may divide
1562   /// by zero.
1563   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1564 
1565   // Returns true if \p I is an instruction that will be predicated either
1566   // through scalar predication or masked load/store or masked gather/scatter.
1567   // \p VF is the vectorization factor that will be used to vectorize \p I.
1568   // Superset of instructions that return true for isScalarWithPredication.
1569   bool isPredicatedInst(Instruction *I, ElementCount VF,
1570                         bool IsKnownUniform = false) {
    // When we know the load is uniform and the original scalar loop was not
    // predicated, we don't need to mark it as a predicated instruction. Any
    // vectorized blocks created when tail-folding are artificial blocks we
    // have introduced, and we know there is always at least one active lane.
    // That's why we call Legal->blockNeedsPredication here, because it doesn't
    // query tail-folding.
1577     if (IsKnownUniform && isa<LoadInst>(I) &&
1578         !Legal->blockNeedsPredication(I->getParent()))
1579       return false;
1580     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1581       return false;
1582     // Loads and stores that need some form of masked operation are predicated
1583     // instructions.
1584     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1585       return Legal->isMaskRequired(I);
1586     return isScalarWithPredication(I, VF);
1587   }
1588 
1589   /// Returns true if \p I is a memory instruction with consecutive memory
1590   /// access that can be widened.
1591   bool
1592   memoryInstructionCanBeWidened(Instruction *I,
1593                                 ElementCount VF = ElementCount::getFixed(1));
1594 
1595   /// Returns true if \p I is a memory instruction in an interleaved-group
1596   /// of memory accesses that can be vectorized with wide vector loads/stores
1597   /// and shuffles.
1598   bool
1599   interleavedAccessCanBeWidened(Instruction *I,
1600                                 ElementCount VF = ElementCount::getFixed(1));
1601 
1602   /// Check if \p Instr belongs to any interleaved access group.
1603   bool isAccessInterleaved(Instruction *Instr) {
1604     return InterleaveInfo.isInterleaved(Instr);
1605   }
1606 
1607   /// Get the interleaved access group that \p Instr belongs to.
1608   const InterleaveGroup<Instruction> *
1609   getInterleavedAccessGroup(Instruction *Instr) {
1610     return InterleaveInfo.getInterleaveGroup(Instr);
1611   }
1612 
1613   /// Returns true if we're required to use a scalar epilogue for at least
1614   /// the final iteration of the original loop.
1615   bool requiresScalarEpilogue(ElementCount VF) const {
1616     if (!isScalarEpilogueAllowed())
1617       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1620     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1621       return true;
1622     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1623   }
1624 
  /// Returns true if a scalar epilogue is allowed, i.e. not disabled due to
  /// optsize or a loop hint annotation.
1627   bool isScalarEpilogueAllowed() const {
1628     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1629   }
1630 
1631   /// Returns true if all loop blocks should be masked to fold tail loop.
1632   bool foldTailByMasking() const { return FoldTailByMasking; }
1633 
  /// Returns true if the instructions in this block require predication
1635   /// for any reason, e.g. because tail folding now requires a predicate
1636   /// or because the block in the original loop was predicated.
1637   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1638     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1639   }
1640 
1641   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1642   /// nodes to the chain of instructions representing the reductions. Uses a
1643   /// MapVector to ensure deterministic iteration order.
1644   using ReductionChainMap =
1645       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1646 
1647   /// Return the chain of instructions representing an inloop reduction.
1648   const ReductionChainMap &getInLoopReductionChains() const {
1649     return InLoopReductionChains;
1650   }
1651 
1652   /// Returns true if the Phi is part of an inloop reduction.
1653   bool isInLoopReduction(PHINode *Phi) const {
1654     return InLoopReductionChains.count(Phi);
1655   }
1656 
1657   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1658   /// with factor VF.  Return the cost of the instruction, including
1659   /// scalarization overhead if it's needed.
1660   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1661 
1662   /// Estimate cost of a call instruction CI if it were vectorized with factor
1663   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1667   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1668                                     bool &NeedToScalarize) const;
1669 
1670   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1671   /// that of B.
1672   bool isMoreProfitable(const VectorizationFactor &A,
1673                         const VectorizationFactor &B) const;
1674 
1675   /// Invalidates decisions already taken by the cost model.
1676   void invalidateCostModelingDecisions() {
1677     WideningDecisions.clear();
1678     Uniforms.clear();
1679     Scalars.clear();
1680   }
1681 
1682 private:
1683   unsigned NumPredStores = 0;
1684 
  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, or otherwise returns the value
  /// returned by the corresponding TTI method.
1688   Optional<unsigned> getVScaleForTuning() const;
1689 
1690   /// \return An upper bound for the vectorization factors for both
1691   /// fixed and scalable vectorization, where the minimum-known number of
1692   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1693   /// disabled or unsupported, then the scalable part will be equal to
1694   /// ElementCount::getScalable(0).
1695   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1696                                            ElementCount UserVF,
1697                                            bool FoldTailByMasking);
1698 
  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
1707   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1708                                        unsigned SmallestType,
1709                                        unsigned WidestType,
1710                                        const ElementCount &MaxSafeVF,
1711                                        bool FoldTailByMasking);
1712 
1713   /// \return the maximum legal scalable VF, based on the safe max number
1714   /// of elements.
1715   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1716 
1717   /// The vectorization cost is a combination of the cost itself and a boolean
1718   /// indicating whether any of the contributing operations will actually
1719   /// operate on vector values after type legalization in the backend. If this
1720   /// latter value is false, then all operations will be scalarized (i.e. no
1721   /// vectorization has actually taken place).
1722   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1723 
1724   /// Returns the expected execution cost. The unit of the cost does
1725   /// not matter because we use the 'cost' units to compare different
1726   /// vector widths. The cost that is returned is *not* normalized by
1727   /// the factor width. If \p Invalid is not nullptr, this function
1728   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1729   /// each instruction that has an Invalid cost for the given VF.
1730   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1731   VectorizationCostTy
1732   expectedCost(ElementCount VF,
1733                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1734 
1735   /// Returns the execution time cost of an instruction for a given vector
1736   /// width. Vector width of one means scalar.
1737   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1738 
1739   /// The cost-computation logic from getInstructionCost which provides
1740   /// the vector type as an output parameter.
1741   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1742                                      Type *&VectorTy);
1743 
1744   /// Return the cost of instructions in an inloop reduction pattern, if I is
1745   /// part of that pattern.
1746   Optional<InstructionCost>
1747   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1748                           TTI::TargetCostKind CostKind);
1749 
1750   /// Calculate vectorization cost of memory instruction \p I.
1751   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1752 
1753   /// The cost computation for scalarized memory instruction.
1754   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1755 
1756   /// The cost computation for interleaving group of memory instructions.
1757   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1758 
1759   /// The cost computation for Gather/Scatter instruction.
1760   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1761 
1762   /// The cost computation for widening instruction \p I with consecutive
1763   /// memory access.
1764   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1765 
1766   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1767   /// Load: scalar load + broadcast.
1768   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1769   /// element)
1770   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1771 
1772   /// Estimate the overhead of scalarizing an instruction. This is a
1773   /// convenience wrapper for the type-based getScalarizationOverhead API.
1774   InstructionCost getScalarizationOverhead(Instruction *I,
1775                                            ElementCount VF) const;
1776 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1779   bool isConsecutiveLoadOrStore(Instruction *I);
1780 
1781   /// Returns true if an artificially high cost for emulated masked memrefs
1782   /// should be used.
1783   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1784 
1785   /// Map of scalar integer values to the smallest bitwidth they can be legally
1786   /// represented as. The vector equivalents of these values should be truncated
1787   /// to this type.
1788   MapVector<Instruction *, uint64_t> MinBWs;
1789 
1790   /// A type representing the costs for instructions if they were to be
1791   /// scalarized rather than vectorized. The entries are Instruction-Cost
1792   /// pairs.
1793   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1794 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1797   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1798 
1799   /// Records whether it is allowed to have the original scalar loop execute at
1800   /// least once. This may be needed as a fallback loop in case runtime
1801   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1803   /// or as a peel-loop to handle gaps in interleave-groups.
1804   /// Under optsize and when the trip count is very small we don't allow any
1805   /// iterations to execute in the scalar loop.
1806   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1807 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1809   bool FoldTailByMasking = false;
1810 
1811   /// A map holding scalar costs for different vectorization factors. The
1812   /// presence of a cost for an instruction in the mapping indicates that the
1813   /// instruction will be scalarized when vectorizing with the associated
1814   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1815   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1816 
1817   /// Holds the instructions known to be uniform after vectorization.
1818   /// The data is collected per VF.
1819   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1820 
1821   /// Holds the instructions known to be scalar after vectorization.
1822   /// The data is collected per VF.
1823   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1824 
1825   /// Holds the instructions (address computations) that are forced to be
1826   /// scalarized.
1827   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1828 
1829   /// PHINodes of the reductions that should be expanded in-loop along with
1830   /// their associated chains of reduction operations, in program order from top
1831   /// (PHI) to bottom
1832   ReductionChainMap InLoopReductionChains;
1833 
1834   /// A Map of inloop reduction operations and their immediate chain operand.
1835   /// FIXME: This can be removed once reductions can be costed correctly in
1836   /// vplan. This was added to allow quick lookup to the inloop operations,
1837   /// without having to loop through InLoopReductionChains.
1838   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1839 
1840   /// Returns the expected difference in cost from scalarizing the expression
1841   /// feeding a predicated instruction \p PredInst. The instructions to
1842   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1843   /// non-negative return value implies the expression will be scalarized.
1844   /// Currently, only single-use chains are considered for scalarization.
1845   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1846                               ElementCount VF);
1847 
1848   /// Collect the instructions that are uniform after vectorization. An
1849   /// instruction is uniform if we represent it with a single scalar value in
1850   /// the vectorized loop corresponding to each vector iteration. Examples of
1851   /// uniform instructions include pointer operands of consecutive or
1852   /// interleaved memory accesses. Note that although uniformity implies an
1853   /// instruction will be scalar, the reverse is not true. In general, a
1854   /// scalarized instruction will be represented by VF scalar values in the
1855   /// vectorized loop, each corresponding to an iteration of the original
1856   /// scalar loop.
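  /// For example (illustrative IR), in
  ///   %p = getelementptr inbounds i32, i32* %base, i64 %iv
  ///   %v = load i32, i32* %p
  /// a consecutive widened load only needs the first-lane value of %p, so %p
  /// is typically uniform after vectorization even though %v is widened.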
1857   void collectLoopUniforms(ElementCount VF);
1858 
1859   /// Collect the instructions that are scalar after vectorization. An
1860   /// instruction is scalar if it is known to be uniform or will be scalarized
1861   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1862   /// to the list if they are used by a load/store instruction that is marked as
1863   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1864   /// VF values in the vectorized loop, each corresponding to an iteration of
1865   /// the original scalar loop.
1866   void collectLoopScalars(ElementCount VF);
1867 
1868   /// Keeps cost model vectorization decision and cost for instructions.
1869   /// Right now it is used for memory instructions only.
1870   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1871                                 std::pair<InstWidening, InstructionCost>>;
1872 
1873   DecisionList WideningDecisions;
1874 
1875   /// Returns true if \p V is expected to be vectorized and it needs to be
1876   /// extracted.
1877   bool needsExtract(Value *V, ElementCount VF) const {
1878     Instruction *I = dyn_cast<Instruction>(V);
1879     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1880         TheLoop->isLoopInvariant(I))
1881       return false;
1882 
1883     // Assume we can vectorize V (and hence we need extraction) if the
1884     // scalars are not computed yet. This can happen, because it is called
1885     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1886     // the scalars are collected. That should be a safe assumption in most
1887     // cases, because we check if the operands have vectorizable types
1888     // beforehand in LoopVectorizationLegality.
1889     return Scalars.find(VF) == Scalars.end() ||
1890            !isScalarAfterVectorization(I, VF);
1891   };
1892 
1893   /// Returns a range containing only operands needing to be extracted.
1894   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1895                                                    ElementCount VF) const {
1896     return SmallVector<Value *, 4>(make_filter_range(
1897         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1898   }
1899 
1900   /// Determines if we have the infrastructure to vectorize loop \p L and its
1901   /// epilogue, assuming the main loop is vectorized by \p VF.
1902   bool isCandidateForEpilogueVectorization(const Loop &L,
1903                                            const ElementCount VF) const;
1904 
1905   /// Returns true if epilogue vectorization is considered profitable, and
1906   /// false otherwise.
1907   /// \p VF is the vectorization factor chosen for the original loop.
1908   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1909 
1910 public:
1911   /// The loop that we evaluate.
1912   Loop *TheLoop;
1913 
1914   /// Predicated scalar evolution analysis.
1915   PredicatedScalarEvolution &PSE;
1916 
1917   /// Loop Info analysis.
1918   LoopInfo *LI;
1919 
1920   /// Vectorization legality.
1921   LoopVectorizationLegality *Legal;
1922 
1923   /// Vector target information.
1924   const TargetTransformInfo &TTI;
1925 
1926   /// Target Library Info.
1927   const TargetLibraryInfo *TLI;
1928 
1929   /// Demanded bits analysis.
1930   DemandedBits *DB;
1931 
1932   /// Assumption cache.
1933   AssumptionCache *AC;
1934 
1935   /// Interface to emit optimization remarks.
1936   OptimizationRemarkEmitter *ORE;
1937 
1938   const Function *TheFunction;
1939 
1940   /// Loop Vectorize Hint.
1941   const LoopVectorizeHints *Hints;
1942 
  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1945   InterleavedAccessInfo &InterleaveInfo;
1946 
1947   /// Values to ignore in the cost model.
1948   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1949 
1950   /// Values to ignore in the cost model when VF > 1.
1951   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1952 
1953   /// All element types found in the loop.
1954   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1955 
1956   /// Profitable vector factors.
1957   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1958 };
1959 } // end namespace llvm
1960 
1961 /// Helper struct to manage generating runtime checks for vectorization.
1962 ///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimation of their cost, and are un-linked from the existing IR. After
/// deciding to vectorize, the checks are moved back. If the decision is not to
/// vectorize, the temporary blocks are completely removed.
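///
/// Typical usage (illustrative): construct the object, call Create() so the
/// checks can be costed, then either wire them into the vectorized CFG via
/// emitSCEVChecks()/emitMemRuntimeChecks(), or let the destructor discard the
/// unused temporary blocks if vectorization is abandoned.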
1967 class GeneratedRTChecks {
1968   /// Basic block which contains the generated SCEV checks, if any.
1969   BasicBlock *SCEVCheckBlock = nullptr;
1970 
1971   /// The value representing the result of the generated SCEV checks. If it is
1972   /// nullptr, either no SCEV checks have been generated or they have been used.
1973   Value *SCEVCheckCond = nullptr;
1974 
1975   /// Basic block which contains the generated memory runtime checks, if any.
1976   BasicBlock *MemCheckBlock = nullptr;
1977 
1978   /// The value representing the result of the generated memory runtime checks.
1979   /// If it is nullptr, either no memory runtime checks have been generated or
1980   /// they have been used.
1981   Value *MemRuntimeCheckCond = nullptr;
1982 
1983   DominatorTree *DT;
1984   LoopInfo *LI;
1985 
1986   SCEVExpander SCEVExp;
1987   SCEVExpander MemCheckExp;
1988 
1989 public:
1990   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1991                     const DataLayout &DL)
1992       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1993         MemCheckExp(SE, DL, "scev.check") {}
1994 
  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
2000   void Create(Loop *L, const LoopAccessInfo &LAI,
2001               const SCEVUnionPredicate &UnionPred) {
2002 
2003     BasicBlock *LoopHeader = L->getHeader();
2004     BasicBlock *Preheader = L->getLoopPreheader();
2005 
2006     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2007     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2008     // may be used by SCEVExpander. The blocks will be un-linked from their
2009     // predecessors and removed from LI & DT at the end of the function.
2010     if (!UnionPred.isAlwaysTrue()) {
2011       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2012                                   nullptr, "vector.scevcheck");
2013 
2014       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2015           &UnionPred, SCEVCheckBlock->getTerminator());
2016     }
2017 
2018     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2019     if (RtPtrChecking.Need) {
2020       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2021       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2022                                  "vector.memcheck");
2023 
2024       MemRuntimeCheckCond =
2025           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2026                            RtPtrChecking.getChecks(), MemCheckExp);
2027       assert(MemRuntimeCheckCond &&
2028              "no RT checks generated although RtPtrChecking "
2029              "claimed checks are required");
2030     }
2031 
2032     if (!MemCheckBlock && !SCEVCheckBlock)
2033       return;
2034 
    // Unhook the temporary blocks containing the checks and update the
    // various places accordingly.
2037     if (SCEVCheckBlock)
2038       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2039     if (MemCheckBlock)
2040       MemCheckBlock->replaceAllUsesWith(Preheader);
2041 
2042     if (SCEVCheckBlock) {
2043       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2044       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2045       Preheader->getTerminator()->eraseFromParent();
2046     }
2047     if (MemCheckBlock) {
2048       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2049       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2050       Preheader->getTerminator()->eraseFromParent();
2051     }
2052 
2053     DT->changeImmediateDominator(LoopHeader, Preheader);
2054     if (MemCheckBlock) {
2055       DT->eraseNode(MemCheckBlock);
2056       LI->removeBlock(MemCheckBlock);
2057     }
2058     if (SCEVCheckBlock) {
2059       DT->eraseNode(SCEVCheckBlock);
2060       LI->removeBlock(SCEVCheckBlock);
2061     }
2062   }
2063 
2064   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2065   /// unused.
2066   ~GeneratedRTChecks() {
2067     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2068     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2069     if (!SCEVCheckCond)
2070       SCEVCleaner.markResultUsed();
2071 
2072     if (!MemRuntimeCheckCond)
2073       MemCheckCleaner.markResultUsed();
2074 
2075     if (MemRuntimeCheckCond) {
2076       auto &SE = *MemCheckExp.getSE();
2077       // Memory runtime check generation creates compares that use expanded
2078       // values. Remove them before running the SCEVExpanderCleaners.
2079       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2080         if (MemCheckExp.isInsertedInstruction(&I))
2081           continue;
2082         SE.forgetValue(&I);
2083         I.eraseFromParent();
2084       }
2085     }
2086     MemCheckCleaner.cleanup();
2087     SCEVCleaner.cleanup();
2088 
2089     if (SCEVCheckCond)
2090       SCEVCheckBlock->eraseFromParent();
2091     if (MemRuntimeCheckCond)
2092       MemCheckBlock->eraseFromParent();
2093   }
2094 
2095   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2096   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2097   /// depending on the generated condition.
2098   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2099                              BasicBlock *LoopVectorPreHeader,
2100                              BasicBlock *LoopExitBlock) {
2101     if (!SCEVCheckCond)
2102       return nullptr;
2103     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2104       if (C->isZero())
2105         return nullptr;
2106 
2107     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2108 
2109     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // If the vector preheader is inside a loop, also add the SCEV check block
    // to that loop.
2111     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2112       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2113 
2114     SCEVCheckBlock->getTerminator()->eraseFromParent();
2115     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2116     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2117                                                 SCEVCheckBlock);
2118 
2119     DT->addNewBlock(SCEVCheckBlock, Pred);
2120     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2121 
2122     ReplaceInstWithInst(
2123         SCEVCheckBlock->getTerminator(),
2124         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2125     // Mark the check as used, to prevent it from being removed during cleanup.
2126     SCEVCheckCond = nullptr;
2127     return SCEVCheckBlock;
2128   }
2129 
2130   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2131   /// the branches to branch to the vector preheader or \p Bypass, depending on
2132   /// the generated condition.
2133   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2134                                    BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays overlap.
2136     if (!MemRuntimeCheckCond)
2137       return nullptr;
2138 
2139     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2140     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2141                                                 MemCheckBlock);
2142 
2143     DT->addNewBlock(MemCheckBlock, Pred);
2144     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2145     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2146 
2147     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2148       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2149 
2150     ReplaceInstWithInst(
2151         MemCheckBlock->getTerminator(),
2152         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2153     MemCheckBlock->getTerminator()->setDebugLoc(
2154         Pred->getTerminator()->getDebugLoc());
2155 
2156     // Mark the check as used, to prevent it from being removed during cleanup.
2157     MemRuntimeCheckCond = nullptr;
2158     return MemCheckBlock;
2159   }
2160 };
2161 
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the 'clang loop vectorize'
// pragma semantics. That pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legality checks and
// assume that vectorization is legal). However, both hints are implemented
// using the same metadata (llvm.loop.vectorize, processed by
// LoopVectorizeHints). This will be fixed in the future when the native IR
// representation for pragma 'omp simd' is introduced.
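// For example (illustrative), a loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
// or
//   #pragma omp simd simdlen(4)
// qualifies as explicitly vectorized for the purpose of this check.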
2176 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2177                                    OptimizationRemarkEmitter *ORE) {
2178   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2179   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2180 
2181   // Only outer loops with an explicit vectorization hint are supported.
2182   // Unannotated outer loops are ignored.
2183   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2184     return false;
2185 
2186   Function *Fn = OuterLp->getHeader()->getParent();
2187   if (!Hints.allowVectorization(Fn, OuterLp,
2188                                 true /*VectorizeOnlyWhenForced*/)) {
2189     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2190     return false;
2191   }
2192 
2193   if (Hints.getInterleave() > 1) {
2194     // TODO: Interleave support is future work.
2195     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2196                          "outer loops.\n");
2197     Hints.emitRemarkWithHints();
2198     return false;
2199   }
2200 
2201   return true;
2202 }
2203 
2204 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2205                                   OptimizationRemarkEmitter *ORE,
2206                                   SmallVectorImpl<Loop *> &V) {
2207   // Collect inner loops and outer loops without irreducible control flow. For
2208   // now, only collect outer loops that have explicit vectorization hints. If we
2209   // are stress testing the VPlan H-CFG construction, we collect the outermost
2210   // loop of every loop nest.
2211   if (L.isInnermost() || VPlanBuildStressTest ||
2212       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2213     LoopBlocksRPO RPOT(&L);
2214     RPOT.perform(LI);
2215     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2216       V.push_back(&L);
2217       // TODO: Collect inner loops inside marked outer loops in case
2218       // vectorization fails for the outer loop. Do not invoke
2219       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2220       // already known to be reducible. We can use an inherited attribute for
2221       // that.
2222       return;
2223     }
2224   }
2225   for (Loop *InnerL : L)
2226     collectSupportedLoops(*InnerL, LI, ORE, V);
2227 }
2228 
2229 namespace {
2230 
2231 /// The LoopVectorize Pass.
2232 struct LoopVectorize : public FunctionPass {
2233   /// Pass identification, replacement for typeid
2234   static char ID;
2235 
2236   LoopVectorizePass Impl;
2237 
2238   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2239                          bool VectorizeOnlyWhenForced = false)
2240       : FunctionPass(ID),
2241         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2242     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2243   }
2244 
2245   bool runOnFunction(Function &F) override {
2246     if (skipFunction(F))
2247       return false;
2248 
2249     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2250     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2251     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2252     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2253     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2254     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2255     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2256     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2257     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2258     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2259     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2260     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2261     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2262 
2263     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2264         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2265 
2266     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2267                         GetLAA, *ORE, PSI).MadeAnyChange;
2268   }
2269 
2270   void getAnalysisUsage(AnalysisUsage &AU) const override {
2271     AU.addRequired<AssumptionCacheTracker>();
2272     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2273     AU.addRequired<DominatorTreeWrapperPass>();
2274     AU.addRequired<LoopInfoWrapperPass>();
2275     AU.addRequired<ScalarEvolutionWrapperPass>();
2276     AU.addRequired<TargetTransformInfoWrapperPass>();
2277     AU.addRequired<AAResultsWrapperPass>();
2278     AU.addRequired<LoopAccessLegacyAnalysis>();
2279     AU.addRequired<DemandedBitsWrapperPass>();
2280     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2281     AU.addRequired<InjectTLIMappingsLegacy>();
2282 
2283     // We currently do not preserve loopinfo/dominator analyses with outer loop
2284     // vectorization. Until this is addressed, mark these analyses as preserved
2285     // only for non-VPlan-native path.
2286     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2287     if (!EnableVPlanNativePath) {
2288       AU.addPreserved<LoopInfoWrapperPass>();
2289       AU.addPreserved<DominatorTreeWrapperPass>();
2290     }
2291 
2292     AU.addPreserved<BasicAAWrapperPass>();
2293     AU.addPreserved<GlobalsAAWrapperPass>();
2294     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2295   }
2296 };
2297 
2298 } // end anonymous namespace
2299 
2300 //===----------------------------------------------------------------------===//
2301 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2302 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2303 //===----------------------------------------------------------------------===//
2304 
2305 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2309   Instruction *Instr = dyn_cast<Instruction>(V);
2310   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2311                      (!Instr ||
2312                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2313   // Place the code for broadcasting invariant variables in the new preheader.
2314   IRBuilder<>::InsertPointGuard Guard(Builder);
2315   if (SafeToHoist)
2316     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2317 
2318   // Broadcast the scalar into all locations in the vector.
2319   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2320 
2321   return Shuf;
2322 }
2323 
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant for FP induction variables.
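///
/// For example (illustrative), with a 4-element Val holding a splat of %start,
/// StartIdx = 0 and an integer Step = %s, the result is
///   <%start, %start + %s, %start + 2 * %s, %start + 3 * %s>.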
2328 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2329                             Instruction::BinaryOps BinOp, ElementCount VF,
2330                             IRBuilderBase &Builder) {
2331   assert(VF.isVector() && "only vector VFs are supported");
2332 
2333   // Create and check the types.
2334   auto *ValVTy = cast<VectorType>(Val->getType());
2335   ElementCount VLen = ValVTy->getElementCount();
2336 
2337   Type *STy = Val->getType()->getScalarType();
2338   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2339          "Induction Step must be an integer or FP");
2340   assert(Step->getType() == STy && "Step has wrong type");
2341 
2342   SmallVector<Constant *, 8> Indices;
2343 
  // Create a vector of consecutive numbers from zero to VF - 1.
2345   VectorType *InitVecValVTy = ValVTy;
2346   Type *InitVecValSTy = STy;
2347   if (STy->isFloatingPointTy()) {
2348     InitVecValSTy =
2349         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2350     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2351   }
2352   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2353 
2354   // Splat the StartIdx
2355   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2356 
2357   if (STy->isIntegerTy()) {
2358     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2359     Step = Builder.CreateVectorSplat(VLen, Step);
2360     assert(Step->getType() == Val->getType() && "Invalid step vec");
2361     // FIXME: The newly created binary instructions should contain nsw/nuw
2362     // flags, which can be found from the original scalar operations.
2363     Step = Builder.CreateMul(InitVec, Step);
2364     return Builder.CreateAdd(Val, Step, "induction");
2365   }
2366 
2367   // Floating point induction.
2368   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2369          "Binary Opcode should be specified for FP induction");
2370   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2371   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2372 
2373   Step = Builder.CreateVectorSplat(VLen, Step);
2374   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2375   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2376 }
2377 
2378 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2379     const InductionDescriptor &II, Value *Step, Value *Start,
2380     Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
2381   IRBuilderBase &Builder = State.Builder;
2382   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2383          "Expected either an induction phi-node or a truncate of it!");
2384 
2385   // Construct the initial value of the vector IV in the vector loop preheader
2386   auto CurrIP = Builder.saveIP();
2387   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2388   if (isa<TruncInst>(EntryVal)) {
2389     assert(Start->getType()->isIntegerTy() &&
2390            "Truncation requires an integer type");
2391     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2392     Step = Builder.CreateTrunc(Step, TruncType);
2393     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2394   }
2395 
2396   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
2397   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
2398   Value *SteppedStart = getStepVector(
2399       SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);
2400 
2401   // We create vector phi nodes for both integer and floating-point induction
2402   // variables. Here, we determine the kind of arithmetic we will perform.
2403   Instruction::BinaryOps AddOp;
2404   Instruction::BinaryOps MulOp;
2405   if (Step->getType()->isIntegerTy()) {
2406     AddOp = Instruction::Add;
2407     MulOp = Instruction::Mul;
2408   } else {
2409     AddOp = II.getInductionOpcode();
2410     MulOp = Instruction::FMul;
2411   }
2412 
2413   // Multiply the vectorization factor by the step using integer or
2414   // floating-point arithmetic as appropriate.
2415   Type *StepType = Step->getType();
2416   Value *RuntimeVF;
2417   if (Step->getType()->isFloatingPointTy())
2418     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
2419   else
2420     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
2421   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2422 
2423   // Create a vector splat to use in the induction update.
2424   //
2425   // FIXME: If the step is non-constant, we create the vector splat with
2426   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2427   //        handle a constant vector splat.
2428   Value *SplatVF = isa<Constant>(Mul)
2429                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
2430                        : Builder.CreateVectorSplat(State.VF, Mul);
2431   Builder.restoreIP(CurrIP);
2432 
2433   // We may need to add the step a number of times, depending on the unroll
2434   // factor. The last of those goes into the PHI.
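  // For example, with UF = 2 the parts set below are
  //   { vec.ind, vec.ind + VF * Step }
  // and the value fed back into the phi from the latch is
  //   vec.ind + 2 * VF * Step.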
2435   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2436                                     &*LoopVectorBody->getFirstInsertionPt());
2437   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2438   Instruction *LastInduction = VecInd;
2439   for (unsigned Part = 0; Part < UF; ++Part) {
2440     State.set(Def, LastInduction, Part);
2441 
2442     if (isa<TruncInst>(EntryVal))
2443       addMetadata(LastInduction, EntryVal);
2444 
2445     LastInduction = cast<Instruction>(
2446         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2447     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2448   }
2449 
2450   // Move the last step to the end of the latch block. This ensures consistent
2451   // placement of all induction updates.
2452   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2453   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2454   LastInduction->moveBefore(Br);
2455   LastInduction->setName("vec.ind.next");
2456 
2457   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2458   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2459 }
2460 
2461 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2462 /// variable on which to base the steps, \p Step is the size of the step, and
2463 /// \p EntryVal is the value from the original loop that maps to the steps.
2464 /// Note that \p EntryVal doesn't have to be an induction variable - it
2465 /// can also be a truncate instruction.
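/// For example, with VF = 4 and UF = 2, the scalar steps generated for an
/// integer induction are ScalarIV + {0,1,2,3} * Step for part 0 and
/// ScalarIV + {4,5,6,7} * Step for part 1 (one scalar value per lane).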
2466 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2467                              Instruction *EntryVal,
2468                              const InductionDescriptor &ID, VPValue *Def,
2469                              VPTransformState &State) {
2470   IRBuilderBase &Builder = State.Builder;
2471   // We shouldn't have to build scalar steps if we aren't vectorizing.
2472   assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2474   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2475   assert(ScalarIVTy == Step->getType() &&
2476          "Val and Step should have the same type");
2477 
2478   // We build scalar steps for both integer and floating-point induction
2479   // variables. Here, we determine the kind of arithmetic we will perform.
2480   Instruction::BinaryOps AddOp;
2481   Instruction::BinaryOps MulOp;
2482   if (ScalarIVTy->isIntegerTy()) {
2483     AddOp = Instruction::Add;
2484     MulOp = Instruction::Mul;
2485   } else {
2486     AddOp = ID.getInductionOpcode();
2487     MulOp = Instruction::FMul;
2488   }
2489 
2490   // Determine the number of scalars we need to generate for each unroll
2491   // iteration.
2492   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2493   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2494   // Compute the scalar steps and save the results in State.
2495   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2496                                      ScalarIVTy->getScalarSizeInBits());
2497   Type *VecIVTy = nullptr;
2498   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2499   if (!FirstLaneOnly && State.VF.isScalable()) {
2500     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2501     UnitStepVec =
2502         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2503     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2504     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2505   }
2506 
2507   for (unsigned Part = 0; Part < State.UF; ++Part) {
2508     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2509 
2510     if (!FirstLaneOnly && State.VF.isScalable()) {
2511       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2512       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2513       if (ScalarIVTy->isFloatingPointTy())
2514         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2515       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2516       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2517       State.set(Def, Add, Part);
      // It's useful to also record the per-lane values for the known minimum
      // number of elements, so we do that below. This improves code quality
      // when, for example, only the first element needs to be extracted.
2521     }
2522 
2523     if (ScalarIVTy->isFloatingPointTy())
2524       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2525 
2526     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2527       Value *StartIdx = Builder.CreateBinOp(
2528           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2529       // The step returned by `createStepForVF` is a runtime-evaluated value
2530       // when VF is scalable. Otherwise, it should be folded into a Constant.
2531       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2532              "Expected StartIdx to be folded to a constant when VF is not "
2533              "scalable");
2534       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2535       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2536       State.set(Def, Add, VPIteration(Part, Lane));
2537     }
2538   }
2539 }
2540 
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant.
2543 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2544                               Instruction *InsertBefore,
2545                               Loop *OrigLoop = nullptr) {
2546   const DataLayout &DL = SE.getDataLayout();
2547   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2548          "Induction step should be loop invariant");
2549   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2550     return E->getValue();
2551 
2552   SCEVExpander Exp(SE, DL, "induction");
2553   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2554 }
2555 
2556 /// Compute the transformed value of Index at offset StartValue using step
2557 /// StepValue.
2558 /// For integer induction, returns StartValue + Index * StepValue.
2559 /// For pointer induction, returns StartValue[Index * StepValue].
2560 /// FIXME: The newly created binary instructions should contain nsw/nuw
2561 /// flags, which can be found from the original scalar operations.
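/// For example, an integer induction with StartValue 7 and StepValue 3 maps
/// Index 2 to 7 + 2 * 3 = 13.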
2562 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *Step,
2563                                    const InductionDescriptor &ID) {
2564 
2565   auto StartValue = ID.getStartValue();
2566   assert(Index->getType()->getScalarType() == Step->getType() &&
2567          "Index scalar type does not match StepValue type");
2568 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
2575   auto CreateAdd = [&B](Value *X, Value *Y) {
2576     assert(X->getType() == Y->getType() && "Types don't match!");
2577     if (auto *CX = dyn_cast<ConstantInt>(X))
2578       if (CX->isZero())
2579         return Y;
2580     if (auto *CY = dyn_cast<ConstantInt>(Y))
2581       if (CY->isZero())
2582         return X;
2583     return B.CreateAdd(X, Y);
2584   };
2585 
2586   // We allow X to be a vector type, in which case Y will potentially be
2587   // splatted into a vector with the same element count.
2588   auto CreateMul = [&B](Value *X, Value *Y) {
2589     assert(X->getType()->getScalarType() == Y->getType() &&
2590            "Types don't match!");
2591     if (auto *CX = dyn_cast<ConstantInt>(X))
2592       if (CX->isOne())
2593         return Y;
2594     if (auto *CY = dyn_cast<ConstantInt>(Y))
2595       if (CY->isOne())
2596         return X;
2597     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2598     if (XVTy && !isa<VectorType>(Y->getType()))
2599       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2600     return B.CreateMul(X, Y);
2601   };
2602 
2603   switch (ID.getKind()) {
2604   case InductionDescriptor::IK_IntInduction: {
2605     assert(!isa<VectorType>(Index->getType()) &&
2606            "Vector indices not supported for integer inductions yet");
2607     assert(Index->getType() == StartValue->getType() &&
2608            "Index type does not match StartValue type");
2609     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2610       return B.CreateSub(StartValue, Index);
2611     auto *Offset = CreateMul(Index, Step);
2612     return CreateAdd(StartValue, Offset);
2613   }
2614   case InductionDescriptor::IK_PtrInduction: {
2615     assert(isa<Constant>(Step) &&
2616            "Expected constant step for pointer induction");
2617     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2618   }
2619   case InductionDescriptor::IK_FpInduction: {
2620     assert(!isa<VectorType>(Index->getType()) &&
2621            "Vector indices not supported for FP inductions yet");
2622     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2623     auto InductionBinOp = ID.getInductionBinOp();
2624     assert(InductionBinOp &&
2625            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2626             InductionBinOp->getOpcode() == Instruction::FSub) &&
2627            "Original bin op should be defined for FP induction");
2628 
2629     Value *MulExp = B.CreateFMul(Step, Index);
2630     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2631                          "induction");
2632   }
2633   case InductionDescriptor::IK_NoInduction:
2634     return nullptr;
2635   }
2636   llvm_unreachable("invalid enum");
2637 }
2638 
2639 void InnerLoopVectorizer::widenIntOrFpInduction(
2640     PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
2641     Value *CanonicalIV) {
2642   Value *Start = Def->getStartValue()->getLiveInIRValue();
2643   const InductionDescriptor &ID = Def->getInductionDescriptor();
2644   TruncInst *Trunc = Def->getTruncInst();
2645   IRBuilderBase &Builder = State.Builder;
2646   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2647   assert(!State.VF.isZero() && "VF must be non-zero");
2648 
2649   // The value from the original loop to which we are mapping the new induction
2650   // variable.
2651   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2652 
2653   auto &DL = EntryVal->getModule()->getDataLayout();
2654 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2657   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2658     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2659            "Induction step should be loop invariant");
2660     if (PSE.getSE()->isSCEVable(IV->getType())) {
2661       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2662       return Exp.expandCodeFor(Step, Step->getType(),
2663                                State.CFG.VectorPreHeader->getTerminator());
2664     }
2665     return cast<SCEVUnknown>(Step)->getValue();
2666   };
2667 
2668   // The scalar value to broadcast. This is derived from the canonical
2669   // induction variable. If a truncation type is given, truncate the canonical
2670   // induction variable and step. Otherwise, derive these values from the
2671   // induction descriptor.
2672   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2673     Value *ScalarIV = CanonicalIV;
2674     Type *NeededType = IV->getType();
2675     if (!Def->isCanonical() || ScalarIV->getType() != NeededType) {
2676       ScalarIV =
2677           NeededType->isIntegerTy()
2678               ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType)
2679               : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType);
2680       ScalarIV = emitTransformedIndex(Builder, ScalarIV, Step, ID);
2681       ScalarIV->setName("offset.idx");
2682     }
2683     if (Trunc) {
2684       auto *TruncType = cast<IntegerType>(Trunc->getType());
2685       assert(Step->getType()->isIntegerTy() &&
2686              "Truncation requires an integer step");
2687       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2688       Step = Builder.CreateTrunc(Step, TruncType);
2689     }
2690     return ScalarIV;
2691   };
2692 
2693   // Fast-math-flags propagate from the original induction instruction.
2694   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2695   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2696     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2697 
2698   // Now do the actual transformations, and start with creating the step value.
2699   Value *Step = CreateStepValue(ID.getStep());
2700   if (State.VF.isScalar()) {
2701     Value *ScalarIV = CreateScalarIV(Step);
2702     Type *ScalarTy = IntegerType::get(ScalarIV->getContext(),
2703                                       Step->getType()->getScalarSizeInBits());
2704 
2705     Instruction::BinaryOps IncOp = ID.getInductionOpcode();
2706     if (IncOp == Instruction::BinaryOpsEnd)
2707       IncOp = Instruction::Add;
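    // With a scalar VF, each unrolled part simply gets ScalarIV + Part * Step
    // (using the FP opcodes for FP inductions).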
2708     for (unsigned Part = 0; Part < UF; ++Part) {
2709       Value *StartIdx = ConstantInt::get(ScalarTy, Part);
2710       Instruction::BinaryOps MulOp = Instruction::Mul;
2711       if (Step->getType()->isFloatingPointTy()) {
2712         StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType());
2713         MulOp = Instruction::FMul;
2714       }
2715 
2716       Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2717       Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction");
2718       State.set(Def, EntryPart, Part);
2719       if (Trunc) {
2720         assert(!Step->getType()->isFloatingPointTy() &&
2721                "fp inductions shouldn't be truncated");
2722         addMetadata(EntryPart, Trunc);
2723       }
2724     }
2725     return;
2726   }
2727 
2728   // Create a new independent vector induction variable, if one is needed.
2729   if (Def->needsVectorIV())
2730     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2731 
2732   if (Def->needsScalarIV()) {
2733     // Create scalar steps that can be used by instructions we will later
2734     // scalarize. Note that the addition of the scalar steps will not increase
2735     // the number of instructions in the loop in the common case prior to
2736     // InstCombine. We will be trading one vector extract for each scalar step.
2737     Value *ScalarIV = CreateScalarIV(Step);
2738     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2739   }
2740 }
2741 
2742 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2743                                                     const VPIteration &Instance,
2744                                                     VPTransformState &State) {
2745   Value *ScalarInst = State.get(Def, Instance);
2746   Value *VectorValue = State.get(Def, Instance.Part);
2747   VectorValue = Builder.CreateInsertElement(
2748       VectorValue, ScalarInst,
2749       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2750   State.set(Def, VectorValue, Instance.Part);
2751 }
2752 
2753 // Return whether we allow using masked interleave-groups (for dealing with
2754 // strided loads/stores that reside in predicated blocks, or for dealing
2755 // with gaps).
2756 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2757   // If an override option has been passed in for interleaved accesses, use it.
2758   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2759     return EnableMaskedInterleavedMemAccesses;
2760 
2761   return TTI.enableMaskedInterleavedAccessVectorization();
2762 }
2763 
2764 // Try to vectorize the interleave group that \p Instr belongs to.
2765 //
// E.g. Translate the following interleaved load group (factor = 3):
2767 //   for (i = 0; i < N; i+=3) {
2768 //     R = Pic[i];             // Member of index 0
2769 //     G = Pic[i+1];           // Member of index 1
2770 //     B = Pic[i+2];           // Member of index 2
2771 //     ... // do something to R, G, B
2772 //   }
2773 // To:
2774 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2775 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2776 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2777 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2778 //
// Or translate the following interleaved store group (factor = 3):
2780 //   for (i = 0; i < N; i+=3) {
2781 //     ... do something to R, G, B
2782 //     Pic[i]   = R;           // Member of index 0
2783 //     Pic[i+1] = G;           // Member of index 1
2784 //     Pic[i+2] = B;           // Member of index 2
2785 //   }
2786 // To:
2787 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2788 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2789 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2790 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2791 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2792 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2793     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2794     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2795     VPValue *BlockInMask) {
2796   Instruction *Instr = Group->getInsertPos();
2797   const DataLayout &DL = Instr->getModule()->getDataLayout();
2798 
2799   // Prepare for the vector type of the interleaved load/store.
2800   Type *ScalarTy = getLoadStoreType(Instr);
2801   unsigned InterleaveFactor = Group->getFactor();
2802   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2803   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2804 
2805   // Prepare for the new pointers.
2806   SmallVector<Value *, 2> AddrParts;
2807   unsigned Index = Group->getIndex(Instr);
2808 
2809   // TODO: extend the masked interleaved-group support to reversed access.
2810   assert((!BlockInMask || !Group->isReverse()) &&
2811          "Reversed masked interleave-group not supported.");
2812 
2813   // If the group is reverse, adjust the index to refer to the last vector lane
2814   // instead of the first. We adjust the index from the first vector lane,
2815   // rather than directly getting the pointer for lane VF - 1, because the
2816   // pointer operand of the interleaved access is supposed to be uniform. For
2817   // uniform instructions, we're only required to generate a value for the
2818   // first vector lane in each unroll iteration.
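  // For example, with VF = 4 and an interleave factor of 2, the index is
  // advanced by an extra (4 - 1) * 2 = 6 scalar elements, so the negative
  // offset applied below addresses the lowest-addressed tuple covered by the
  // reversed access.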
2819   if (Group->isReverse())
2820     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2821 
2822   for (unsigned Part = 0; Part < UF; Part++) {
2823     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2824     setDebugLocFromInst(AddrPart);
2825 
    // Note that the current instruction may be a member at any index, so the
    // address needs to be adjusted to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2837 
2838     bool InBounds = false;
2839     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2840       InBounds = gep->isInBounds();
2841     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2842     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2843 
2844     // Cast to the vector pointer type.
2845     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2846     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2847     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2848   }
2849 
2850   setDebugLocFromInst(Instr);
2851   Value *PoisonVec = PoisonValue::get(VecTy);
2852 
2853   Value *MaskForGaps = nullptr;
2854   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2855     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2856     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2857   }
2858 
2859   // Vectorize the interleaved load group.
2860   if (isa<LoadInst>(Instr)) {
2861     // For each unroll part, create a wide load for the group.
2862     SmallVector<Value *, 2> NewLoads;
2863     for (unsigned Part = 0; Part < UF; Part++) {
2864       Instruction *NewLoad;
2865       if (BlockInMask || MaskForGaps) {
2866         assert(useMaskedInterleavedAccesses(*TTI) &&
2867                "masked interleaved groups are not allowed.");
2868         Value *GroupMask = MaskForGaps;
2869         if (BlockInMask) {
2870           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2871           Value *ShuffledMask = Builder.CreateShuffleVector(
2872               BlockInMaskPart,
2873               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2874               "interleaved.mask");
2875           GroupMask = MaskForGaps
2876                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2877                                                 MaskForGaps)
2878                           : ShuffledMask;
2879         }
2880         NewLoad =
2881             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2882                                      GroupMask, PoisonVec, "wide.masked.vec");
2883       }
2884       else
2885         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2886                                             Group->getAlign(), "wide.vec");
2887       Group->addMetadata(NewLoad);
2888       NewLoads.push_back(NewLoad);
2889     }
2890 
2891     // For each member in the group, shuffle out the appropriate data from the
2892     // wide loads.
2893     unsigned J = 0;
2894     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2895       Instruction *Member = Group->getMember(I);
2896 
2897       // Skip the gaps in the group.
2898       if (!Member)
2899         continue;
2900 
2901       auto StrideMask =
2902           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2903       for (unsigned Part = 0; Part < UF; Part++) {
2904         Value *StridedVec = Builder.CreateShuffleVector(
2905             NewLoads[Part], StrideMask, "strided.vec");
2906 
        // If this member has a different type, cast the result to that type.
2908         if (Member->getType() != ScalarTy) {
2909           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2910           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2911           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2912         }
2913 
2914         if (Group->isReverse())
2915           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2916 
2917         State.set(VPDefs[J], StridedVec, Part);
2918       }
2919       ++J;
2920     }
2921     return;
2922   }
2923 
  // The subvector type for the current instruction.
2925   auto *SubVT = VectorType::get(ScalarTy, VF);
2926 
2927   // Vectorize the interleaved store group.
2928   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2929   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2930          "masked interleaved groups are not allowed.");
2931   assert((!MaskForGaps || !VF.isScalable()) &&
2932          "masking gaps for scalable vectors is not yet supported.");
2933   for (unsigned Part = 0; Part < UF; Part++) {
2934     // Collect the stored vector from each member.
2935     SmallVector<Value *, 4> StoredVecs;
2936     for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Failed to get a member from an interleaved store group");
2939       Instruction *Member = Group->getMember(i);
2940 
2941       // Skip the gaps in the group.
2942       if (!Member) {
        Value *Poison = PoisonValue::get(SubVT);
        StoredVecs.push_back(Poison);
2945         continue;
2946       }
2947 
2948       Value *StoredVec = State.get(StoredValues[i], Part);
2949 
2950       if (Group->isReverse())
2951         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2952 
      // If this member has a different type, cast it to a unified type.
2955       if (StoredVec->getType() != SubVT)
2956         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2957 
2958       StoredVecs.push_back(StoredVec);
2959     }
2960 
2961     // Concatenate all vectors into a wide vector.
2962     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2963 
2964     // Interleave the elements in the wide vector.
2965     Value *IVec = Builder.CreateShuffleVector(
2966         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2967         "interleaved.vec");
2968 
2969     Instruction *NewStoreInstr;
2970     if (BlockInMask || MaskForGaps) {
2971       Value *GroupMask = MaskForGaps;
2972       if (BlockInMask) {
2973         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2974         Value *ShuffledMask = Builder.CreateShuffleVector(
2975             BlockInMaskPart,
2976             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2977             "interleaved.mask");
2978         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2979                                                       ShuffledMask, MaskForGaps)
2980                                 : ShuffledMask;
2981       }
2982       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2983                                                 Group->getAlign(), GroupMask);
2984     } else
2985       NewStoreInstr =
2986           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2987 
2988     Group->addMetadata(NewStoreInstr);
2989   }
2990 }
2991 
2992 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2993                                                VPReplicateRecipe *RepRecipe,
2994                                                const VPIteration &Instance,
2995                                                bool IfPredicateInstr,
2996                                                VPTransformState &State) {
2997   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2998 
2999   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3000   // the first lane and part.
3001   if (isa<NoAliasScopeDeclInst>(Instr))
3002     if (!Instance.isFirstIteration())
3003       return;
3004 
3005   setDebugLocFromInst(Instr);
3006 
  // Does this instruction return a value?
3008   bool IsVoidRetTy = Instr->getType()->isVoidTy();
3009 
3010   Instruction *Cloned = Instr->clone();
3011   if (!IsVoidRetTy)
3012     Cloned->setName(Instr->getName() + ".cloned");
3013 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
3020   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
3021     Cloned->dropPoisonGeneratingFlags();
3022 
3023   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3024                                Builder.GetInsertPoint());
3025   // Replace the operands of the cloned instructions with their scalar
3026   // equivalents in the new loop.
3027   for (auto &I : enumerate(RepRecipe->operands())) {
3028     auto InputInstance = Instance;
3029     VPValue *Operand = I.value();
3030     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
3031     if (OperandR && OperandR->isUniform())
3032       InputInstance.Lane = VPLane::getFirstLane();
3033     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
3034   }
3035   addNewMetadata(Cloned, Instr);
3036 
3037   // Place the cloned scalar in the new loop.
3038   Builder.Insert(Cloned);
3039 
3040   State.set(RepRecipe, Cloned, Instance);
3041 
  // If we just cloned a new assumption, add it to the assumption cache.
3043   if (auto *II = dyn_cast<AssumeInst>(Cloned))
3044     AC->registerAssumption(II);
3045 
3046   // End if-block.
3047   if (IfPredicateInstr)
3048     PredicatedInstructions.push_back(Cloned);
3049 }
3050 
3051 void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
3052   BasicBlock *Header = L->getHeader();
3053   assert(!L->getLoopLatch() && "loop should not have a latch at this point");
3054 
3055   IRBuilder<> B(Header->getTerminator());
3056   Instruction *OldInst =
3057       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
3058   setDebugLocFromInst(OldInst, &B);
3059 
3060   // Connect the header to the exit and header blocks and replace the old
3061   // terminator.
3062   B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
3063 
3064   // Now we have two terminators. Remove the old one from the block.
3065   Header->getTerminator()->eraseFromParent();
3066 }
3067 
3068 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3069   if (TripCount)
3070     return TripCount;
3071 
3072   assert(L && "Create Trip Count for null loop.");
3073   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3074   // Find the loop boundaries.
3075   ScalarEvolution *SE = PSE.getSE();
3076   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3077   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3078          "Invalid loop count");
3079 
3080   Type *IdxTy = Legal->getWidestInductionType();
3081   assert(IdxTy && "No type for induction");
3082 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable was signed and as such will not overflow, so the
  // truncation is legal.
3088   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3089       IdxTy->getPrimitiveSizeInBits())
3090     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3091   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3092 
3093   // Get the total trip count from the count by adding 1.
3094   const SCEV *ExitCount = SE->getAddExpr(
3095       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3096 
3097   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3098 
3099   // Expand the trip count and place the new instructions in the preheader.
3100   // Notice that the pre-header does not change, only the loop body.
3101   SCEVExpander Exp(*SE, DL, "induction");
3102 
3103   // Count holds the overall loop count (N).
3104   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3105                                 L->getLoopPreheader()->getTerminator());
3106 
3107   if (TripCount->getType()->isPointerTy())
3108     TripCount =
3109         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3110                                     L->getLoopPreheader()->getTerminator());
3111 
3112   return TripCount;
3113 }
3114 
3115 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3116   if (VectorTripCount)
3117     return VectorTripCount;
3118 
3119   Value *TC = getOrCreateTripCount(L);
3120   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3121 
3122   Type *Ty = TC->getType();
3123   // This is where we can make the step a runtime constant.
3124   Value *Step = createStepForVF(Builder, Ty, VF, UF);
3125 
3126   // If the tail is to be folded by masking, round the number of iterations N
3127   // up to a multiple of Step instead of rounding down. This is done by first
3128   // adding Step-1 and then rounding down. Note that it's ok if this addition
3129   // overflows: the vector induction variable will eventually wrap to zero given
3130   // that it starts at zero and its Step is a power of two; the loop will then
3131   // exit, with the last early-exit vector comparison also producing all-true.
3132   if (Cost->foldTailByMasking()) {
3133     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3134            "VF*UF must be a power of 2 when folding tail by masking");
3135     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
3136     TC = Builder.CreateAdd(
3137         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
3138   }
3139 
3140   // Now we need to generate the expression for the part of the loop that the
3141   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3142   // iterations are not required for correctness, or N - Step, otherwise. Step
3143   // is equal to the vectorization factor (number of SIMD elements) times the
3144   // unroll factor (number of SIMD instructions).
3145   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3146 
3147   // There are cases where we *must* run at least one iteration in the remainder
3148   // loop.  See the cost model for when this can happen.  If the step evenly
3149   // divides the trip count, we set the remainder to be equal to the step. If
3150   // the step does not evenly divide the trip count, no adjustment is necessary
3151   // since there will already be scalar iterations. Note that the minimum
3152   // iterations check ensures that N >= Step.
3153   if (Cost->requiresScalarEpilogue(VF)) {
3154     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3155     R = Builder.CreateSelect(IsZero, Step, R);
3156   }
3157 
3158   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3159 
3160   return VectorTripCount;
3161 }
3162 
3163 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3164                                                    const DataLayout &DL) {
3165   // Verify that V is a vector type with same number of elements as DstVTy.
3166   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3167   unsigned VF = DstFVTy->getNumElements();
3168   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3169   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3170   Type *SrcElemTy = SrcVecTy->getElementType();
3171   Type *DstElemTy = DstFVTy->getElementType();
3172   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3173          "Vector elements must have same size");
3174 
3175   // Do a direct cast if element types are castable.
3176   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3177     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3178   }
  // V cannot be cast directly to the desired vector type. This may happen
  // when V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
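  // For example, on a target with 64-bit pointers, a <4 x double> vector is
  // cast to a vector of pointers by going through <4 x i64>.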
3183   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3184          "Only one type should be a pointer type");
3185   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3186          "Only one type should be a floating point type");
3187   Type *IntTy =
3188       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3189   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3190   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3191   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3192 }
3193 
3194 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3195                                                          BasicBlock *Bypass) {
3196   Value *Count = getOrCreateTripCount(L);
  // Reuse the existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
3199   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3200   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3201 
3202   // Generate code to check if the loop's trip count is less than VF * UF, or
3203   // equal to it in case a scalar epilogue is required; this implies that the
3204   // vector trip count is zero. This check also covers the case where adding one
3205   // to the backedge-taken count overflowed leading to an incorrect trip count
3206   // of zero. In this case we will also jump to the scalar loop.
3207   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3208                                             : ICmpInst::ICMP_ULT;
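  // For example, with VF = 4, UF = 2 and a required scalar epilogue (and no
  // tail folding), we branch to the scalar loop whenever the trip count
  // is <= 8.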
3209 
3210   // If tail is to be folded, vector loop takes care of all iterations.
3211   Value *CheckMinIters = Builder.getFalse();
3212   if (!Cost->foldTailByMasking()) {
3213     Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3214     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3215   }
3216   // Create new preheader for vector loop.
3217   LoopVectorPreHeader =
3218       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3219                  "vector.ph");
3220 
3221   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3222                                DT->getNode(Bypass)->getIDom()) &&
3223          "TC check is expected to dominate Bypass");
3224 
3225   // Update dominator for Bypass & LoopExit (if needed).
3226   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3227   if (!Cost->requiresScalarEpilogue(VF))
3228     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3230     // dominator of the exit blocks.
3231     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3232 
3233   ReplaceInstWithInst(
3234       TCCheckBlock->getTerminator(),
3235       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3236   LoopBypassBlocks.push_back(TCCheckBlock);
3237 }
3238 
3239 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3240 
3241   BasicBlock *const SCEVCheckBlock =
3242       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3243   if (!SCEVCheckBlock)
3244     return nullptr;
3245 
3246   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3247            (OptForSizeBasedOnProfile &&
3248             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3249          "Cannot SCEV check stride or overflow when optimizing for size");
3250 
3251 
3252   // Update dominator only if this is first RT check.
3253   if (LoopBypassBlocks.empty()) {
3254     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3255     if (!Cost->requiresScalarEpilogue(VF))
3256       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
3258       // dominator of the exit blocks.
3259       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3260   }
3261 
3262   LoopBypassBlocks.push_back(SCEVCheckBlock);
3263   AddedSafetyChecks = true;
3264   return SCEVCheckBlock;
3265 }
3266 
3267 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3268                                                       BasicBlock *Bypass) {
3269   // VPlan-native path does not do any analysis for runtime checks currently.
3270   if (EnableVPlanNativePath)
3271     return nullptr;
3272 
3273   BasicBlock *const MemCheckBlock =
3274       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3275 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3279   if (!MemCheckBlock)
3280     return nullptr;
3281 
3282   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3283     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3284            "Cannot emit memory checks when optimizing for size, unless forced "
3285            "to vectorize.");
3286     ORE->emit([&]() {
3287       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3288                                         L->getStartLoc(), L->getHeader())
3289              << "Code-size may be reduced by not forcing "
3290                 "vectorization, or by source-code modifications "
3291                 "eliminating the need for runtime checks "
3292                 "(e.g., adding 'restrict').";
3293     });
3294   }
3295 
3296   LoopBypassBlocks.push_back(MemCheckBlock);
3297 
3298   AddedSafetyChecks = true;
3299 
3300   // We currently don't use LoopVersioning for the actual loop cloning but we
3301   // still use it to add the noalias metadata.
3302   LVer = std::make_unique<LoopVersioning>(
3303       *Legal->getLAI(),
3304       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3305       DT, PSE.getSE());
3306   LVer->prepareNoAliasMetadata();
3307   return MemCheckBlock;
3308 }
3309 
3310 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3311   LoopScalarBody = OrigLoop->getHeader();
3312   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3313   assert(LoopVectorPreHeader && "Invalid loop structure");
3314   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3315   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3316          "multiple exit loop without required epilogue?");
3317 
3318   LoopMiddleBlock =
3319       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3320                  LI, nullptr, Twine(Prefix) + "middle.block");
3321   LoopScalarPreHeader =
3322       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3323                  nullptr, Twine(Prefix) + "scalar.ph");
3324 
3325   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3326 
3327   // Set up the middle block terminator.  Two cases:
3328   // 1) If we know that we must execute the scalar epilogue, emit an
3329   //    unconditional branch.
3330   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3332   //    branch from the middle block to the loop scalar preheader, and the
3333   //    exit block.  completeLoopSkeleton will update the condition to use an
3334   //    iteration check, if required to decide whether to execute the remainder.
3335   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3336     BranchInst::Create(LoopScalarPreHeader) :
3337     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3338                        Builder.getTrue());
3339   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3340   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3341 
3342   // We intentionally don't let SplitBlock to update LoopInfo since
3343   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
3344   // LoopVectorBody is explicitly added to the correct place few lines later.
3345   LoopVectorBody =
3346       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3347                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3348 
3349   // Update dominator for loop exit.
3350   if (!Cost->requiresScalarEpilogue(VF))
3351     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3353     // dominator of the exit blocks.
3354     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3355 
3356   // Create and register the new vector loop.
3357   Loop *Lp = LI->AllocateLoop();
3358   Loop *ParentLoop = OrigLoop->getParentLoop();
3359 
3360   // Insert the new loop into the loop nest and register the new basic blocks
3361   // before calling any utilities such as SCEV that require valid LoopInfo.
3362   if (ParentLoop) {
3363     ParentLoop->addChildLoop(Lp);
3364   } else {
3365     LI->addTopLevelLoop(Lp);
3366   }
3367   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3368   return Lp;
3369 }
3370 
3371 void InnerLoopVectorizer::createInductionResumeValues(
3372     Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
3373   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3374           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3375          "Inconsistent information about additional bypass.");
3376 
3377   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3378   assert(VectorTripCount && L && "Expected valid arguments");
3379   // We are going to resume the execution of the scalar loop.
3380   // Go over all of the induction variables that we found and fix the
3381   // PHIs that are left in the scalar version of the loop.
3382   // The starting values of PHI nodes depend on the counter of the last
3383   // iteration in the vectorized loop.
3384   // If we come from a bypass edge then we need to start from the original
3385   // start value.
3386   Instruction *OldInduction = Legal->getPrimaryInduction();
3387   for (auto &InductionEntry : Legal->getInductionVars()) {
3388     PHINode *OrigPhi = InductionEntry.first;
3389     InductionDescriptor II = InductionEntry.second;
3390 
    // Create phi nodes to merge from the backedge-taken check block.
3392     PHINode *BCResumeVal =
3393         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3394                         LoopScalarPreHeader->getTerminator());
3395     // Copy original phi DL over to the new one.
3396     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3397     Value *&EndValue = IVEndValues[OrigPhi];
3398     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3399     if (OrigPhi == OldInduction) {
3400       // We know what the end value is.
3401       EndValue = VectorTripCount;
3402     } else {
3403       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3404 
3405       // Fast-math-flags propagate from the original induction instruction.
3406       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3407         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3408 
3409       Type *StepType = II.getStep()->getType();
3410       Instruction::CastOps CastOp =
3411           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3412       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3413       Value *Step =
3414           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3415       EndValue = emitTransformedIndex(B, CRD, Step, II);
3416       EndValue->setName("ind.end");
3417 
3418       // Compute the end value for the additional bypass (if applicable).
3419       if (AdditionalBypass.first) {
3420         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3421         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3422                                          StepType, true);
3423         Value *Step =
3424             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3425         CRD =
3426             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3427         EndValueFromAdditionalBypass = emitTransformedIndex(B, CRD, Step, II);
3428         EndValueFromAdditionalBypass->setName("ind.end");
3429       }
3430     }
3431     // The new PHI merges the original incoming value, in case of a bypass,
3432     // or the value at the end of the vectorized loop.
3433     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3434 
3435     // Fix the scalar body counter (PHI node).
3436     // The old induction's phi node in the scalar body needs the truncated
3437     // value.
3438     for (BasicBlock *BB : LoopBypassBlocks)
3439       BCResumeVal->addIncoming(II.getStartValue(), BB);
3440 
3441     if (AdditionalBypass.first)
3442       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3443                                             EndValueFromAdditionalBypass);
3444 
3445     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3446   }
3447 }
3448 
3449 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3450                                                       MDNode *OrigLoopID) {
3451   assert(L && "Expected valid loop.");
3452 
3453   // The trip counts should be cached by now.
3454   Value *Count = getOrCreateTripCount(L);
3455   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3456 
3457   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3458 
3459   // Add a check in the middle block to see if we have completed
3460   // all of the iterations in the first vector loop.  Three cases:
3461   // 1) If we require a scalar epilogue, there is no conditional branch as
3462   //    we unconditionally branch to the scalar preheader.  Do nothing.
3463   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3464   //    Thus if tail is to be folded, we know we don't need to run the
3465   //    remainder and we can use the previous value for the condition (true).
3466   // 3) Otherwise, construct a runtime check.
3467   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3468     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3469                                         Count, VectorTripCount, "cmp.n",
3470                                         LoopMiddleBlock->getTerminator());
3471 
3472     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3473     // of the corresponding compare because they may have ended up with
3474     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g. if the compare has a line number inside the loop.
3476     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3477     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3478   }
3479 
3480   // Get ready to start creating new instructions into the vectorized body.
3481   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3482          "Inconsistent vector loop preheader");
3483   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3484 
3485 #ifdef EXPENSIVE_CHECKS
3486   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3487   LI->verify(*DT);
3488 #endif
3489 
3490   return LoopVectorPreHeader;
3491 }
3492 
3493 std::pair<BasicBlock *, Value *>
3494 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3495   /*
3496    In this function we generate a new loop. The new loop will contain
3497    the vectorized instructions while the old loop will continue to run the
3498    scalar remainder.
3499 
3500        [ ] <-- loop iteration number check.
3501     /   |
3502    /    v
3503   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3504   |  /  |
3505   | /   v
3506   ||   [ ]     <-- vector pre header.
3507   |/    |
3508   |     v
3509   |    [  ] \
3510   |    [  ]_|   <-- vector loop.
3511   |     |
3512   |     v
3513   \   -[ ]   <--- middle-block.
3514    \/   |
3515    /\   v
3516    | ->[ ]     <--- new preheader.
3517    |    |
3518  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3519    |   [ ] \
3520    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3521     \   |
3522      \  v
3523       >[ ]     <-- exit block(s).
3524    ...
3525    */
3526 
3527   // Get the metadata of the original loop before it gets modified.
3528   MDNode *OrigLoopID = OrigLoop->getLoopID();
3529 
3530   // Workaround!  Compute the trip count of the original loop and cache it
3531   // before we start modifying the CFG.  This code has a systemic problem
3532   // wherein it tries to run analysis over partially constructed IR; this is
3533   // wrong, and not simply for SCEV.  The trip count of the original loop
3534   // simply happens to be prone to hitting this in practice.  In theory, we
3535   // can hit the same issue for any SCEV, or ValueTracking query done during
3536   // mutation.  See PR49900.
3537   getOrCreateTripCount(OrigLoop);
3538 
3539   // Create an empty vector loop, and prepare basic blocks for the runtime
3540   // checks.
3541   Loop *Lp = createVectorLoopSkeleton("");
3542 
3543   // Now, compare the new count to zero. If it is zero skip the vector loop and
3544   // jump to the scalar loop. This check also covers the case where the
3545   // backedge-taken count is uint##_max: adding one to it will overflow leading
3546   // to an incorrect trip count of zero. In this (rare) case we will also jump
3547   // to the scalar loop.
3548   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3549 
3550   // Generate the code to check any assumptions that we've made for SCEV
3551   // expressions.
3552   emitSCEVChecks(Lp, LoopScalarPreHeader);
3553 
3554   // Generate the code that checks in runtime if arrays overlap. We put the
3555   // checks into a separate block to make the more common case of few elements
3556   // faster.
3557   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3558 
3559   createHeaderBranch(Lp);
3560 
3561   // Emit phis for the new starting index of the scalar loop.
3562   createInductionResumeValues(Lp);
3563 
3564   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
3565 }
3566 
3567 // Fix up external users of the induction variable. At this point, we are
3568 // in LCSSA form, with all external PHIs that use the IV having one input value,
3569 // coming from the remainder loop. We need those PHIs to also have a correct
3570 // value for the IV when arriving directly from the middle block.
3571 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3572                                        const InductionDescriptor &II,
3573                                        Value *CountRoundDown, Value *EndValue,
3574                                        BasicBlock *MiddleBlock) {
3575   // There are two kinds of external IV usages - those that use the value
3576   // computed in the last iteration (the PHI) and those that use the penultimate
3577   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
3579 
3580   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3581 
3582   DenseMap<Value *, Value *> MissingVals;
3583 
3584   // An external user of the last iteration's value should see the value that
3585   // the remainder loop uses to initialize its own IV.
3586   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3587   for (User *U : PostInc->users()) {
3588     Instruction *UI = cast<Instruction>(U);
3589     if (!OrigLoop->contains(UI)) {
3590       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3591       MissingVals[UI] = EndValue;
3592     }
3593   }
3594 
  // An external user of the penultimate value needs to see EndValue - Step.
3596   // The simplest way to get this is to recompute it from the constituent SCEVs,
3597   // that is Start + (Step * (CRD - 1)).
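  // For illustration (shorthand only, assuming the canonical IV starts at 0
  // with step 1):
  //   %iv      = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %iv.next = add i64 %iv, 1
  // an external user of %iv.next sees EndValue, while an external user of
  // %iv sees Start + Step * (CountRoundDown - 1), i.e. EndValue - Step.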
3598   for (User *U : OrigPhi->users()) {
3599     auto *UI = cast<Instruction>(U);
3600     if (!OrigLoop->contains(UI)) {
3601       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3602 
3603       IRBuilder<> B(MiddleBlock->getTerminator());
3604 
3605       // Fast-math-flags propagate from the original induction instruction.
3606       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3607         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3608 
3609       Value *CountMinusOne = B.CreateSub(
3610           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3611       Value *CMO =
3612           !II.getStep()->getType()->isIntegerTy()
3613               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3614                              II.getStep()->getType())
3615               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3616       CMO->setName("cast.cmo");
3617 
3618       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3619                                     LoopVectorBody->getTerminator());
3620       Value *Escape = emitTransformedIndex(B, CMO, Step, II);
3621       Escape->setName("ind.escape");
3622       MissingVals[UI] = Escape;
3623     }
3624   }
3625 
3626   for (auto &I : MissingVals) {
3627     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3629     // that is %IV2 = phi [...], [ %IV1, %latch ]
3630     // In this case, if IV1 has an external use, we need to avoid adding both
3631     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3632     // don't already have an incoming value for the middle block.
3633     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3634       PHI->addIncoming(I.second, MiddleBlock);
3635   }
3636 }
3637 
3638 namespace {
3639 
3640 struct CSEDenseMapInfo {
3641   static bool canHandle(const Instruction *I) {
3642     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3643            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3644   }
3645 
3646   static inline Instruction *getEmptyKey() {
3647     return DenseMapInfo<Instruction *>::getEmptyKey();
3648   }
3649 
3650   static inline Instruction *getTombstoneKey() {
3651     return DenseMapInfo<Instruction *>::getTombstoneKey();
3652   }
3653 
3654   static unsigned getHashValue(const Instruction *I) {
3655     assert(canHandle(I) && "Unknown instruction!");
3656     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3657                                                            I->value_op_end()));
3658   }
3659 
3660   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3661     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3662         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3663       return LHS == RHS;
3664     return LHS->isIdenticalTo(RHS);
3665   }
3666 };
3667 
3668 } // end anonymous namespace
3669 
/// Perform CSE of induction variable instructions.
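/// For illustration (hypothetical IR): two identical extracts such as
///   %a = extractelement <4 x i64> %vec, i32 3
///   %b = extractelement <4 x i64> %vec, i32 3
/// hash and compare equal under CSEDenseMapInfo, so the second one is
/// replaced by the first and erased.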
3671 static void cse(BasicBlock *BB) {
3672   // Perform simple cse.
3673   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3674   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3675     if (!CSEDenseMapInfo::canHandle(&In))
3676       continue;
3677 
3678     // Check if we can replace this instruction with any of the
3679     // visited instructions.
3680     if (Instruction *V = CSEMap.lookup(&In)) {
3681       In.replaceAllUsesWith(V);
3682       In.eraseFromParent();
3683       continue;
3684     }
3685 
3686     CSEMap[&In] = &In;
3687   }
3688 }
3689 
3690 InstructionCost
3691 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3692                                               bool &NeedToScalarize) const {
3693   Function *F = CI->getCalledFunction();
3694   Type *ScalarRetTy = CI->getType();
3695   SmallVector<Type *, 4> Tys, ScalarTys;
3696   for (auto &ArgOp : CI->args())
3697     ScalarTys.push_back(ArgOp->getType());
3698 
3699   // Estimate cost of scalarized vector call. The source operands are assumed
3700   // to be vectors, so we need to extract individual elements from there,
3701   // execute VF scalar calls, and then gather the result into the vector return
3702   // value.
3703   InstructionCost ScalarCallCost =
3704       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3705   if (VF.isScalar())
3706     return ScalarCallCost;
3707 
3708   // Compute corresponding vector type for return value and arguments.
3709   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3710   for (Type *ScalarTy : ScalarTys)
3711     Tys.push_back(ToVectorTy(ScalarTy, VF));
3712 
3713   // Compute costs of unpacking argument values for the scalar calls and
3714   // packing the return values to a vector.
3715   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3716 
3717   InstructionCost Cost =
3718       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
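  // For example (illustrative costs only): with VF = 4, a scalar call cost
  // of 10 and a scalarization overhead of 6, the scalarized estimate is
  // 4 * 10 + 6 = 46; a vector library variant costing less than that is
  // preferred below and NeedToScalarize is cleared.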
3719 
3720   // If we can't emit a vector call for this function, then the currently found
3721   // cost is the cost we need to return.
3722   NeedToScalarize = true;
3723   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3724   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3725 
3726   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3727     return Cost;
3728 
3729   // If the corresponding vector cost is cheaper, return its cost.
3730   InstructionCost VectorCallCost =
3731       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3732   if (VectorCallCost < Cost) {
3733     NeedToScalarize = false;
3734     Cost = VectorCallCost;
3735   }
3736   return Cost;
3737 }
3738 
3739 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3740   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3741     return Elt;
3742   return VectorType::get(Elt, VF);
3743 }
3744 
3745 InstructionCost
3746 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3747                                                    ElementCount VF) const {
3748   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3749   assert(ID && "Expected intrinsic call!");
3750   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3751   FastMathFlags FMF;
3752   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3753     FMF = FPMO->getFastMathFlags();
3754 
3755   SmallVector<const Value *> Arguments(CI->args());
3756   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3757   SmallVector<Type *> ParamTys;
3758   std::transform(FTy->param_begin(), FTy->param_end(),
3759                  std::back_inserter(ParamTys),
3760                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3761 
3762   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3763                                     dyn_cast<IntrinsicInst>(CI));
3764   return TTI.getIntrinsicInstrCost(CostAttrs,
3765                                    TargetTransformInfo::TCK_RecipThroughput);
3766 }
3767 
3768 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3769   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3770   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3771   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3772 }
3773 
3774 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3775   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3776   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3777   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3778 }
3779 
3780 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3781   // For every instruction `I` in MinBWs, truncate the operands, create a
3782   // truncated version of `I` and reextend its result. InstCombine runs
3783   // later and will remove any ext/trunc pairs.
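  // For illustration (shorthand IR, assuming a minimal bitwidth of 8 for a
  // 32-bit add):
  //   %a = add <4 x i32> %x, %y
  // becomes
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a    = zext <4 x i8> %a.tr to <4 x i32>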
3784   SmallPtrSet<Value *, 4> Erased;
3785   for (const auto &KV : Cost->getMinimalBitwidths()) {
3786     // If the value wasn't vectorized, we must maintain the original scalar
3787     // type. The absence of the value from State indicates that it
3788     // wasn't vectorized.
3789     // FIXME: Should not rely on getVPValue at this point.
3790     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3791     if (!State.hasAnyVectorValue(Def))
3792       continue;
3793     for (unsigned Part = 0; Part < UF; ++Part) {
3794       Value *I = State.get(Def, Part);
3795       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3796         continue;
3797       Type *OriginalTy = I->getType();
3798       Type *ScalarTruncatedTy =
3799           IntegerType::get(OriginalTy->getContext(), KV.second);
3800       auto *TruncatedTy = VectorType::get(
3801           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3802       if (TruncatedTy == OriginalTy)
3803         continue;
3804 
3805       IRBuilder<> B(cast<Instruction>(I));
3806       auto ShrinkOperand = [&](Value *V) -> Value * {
3807         if (auto *ZI = dyn_cast<ZExtInst>(V))
3808           if (ZI->getSrcTy() == TruncatedTy)
3809             return ZI->getOperand(0);
3810         return B.CreateZExtOrTrunc(V, TruncatedTy);
3811       };
3812 
3813       // The actual instruction modification depends on the instruction type,
3814       // unfortunately.
3815       Value *NewI = nullptr;
3816       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3817         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3818                              ShrinkOperand(BO->getOperand(1)));
3819 
3820         // Any wrapping introduced by shrinking this operation shouldn't be
3821         // considered undefined behavior. So, we can't unconditionally copy
3822         // arithmetic wrapping flags to NewI.
3823         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3824       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3825         NewI =
3826             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3827                          ShrinkOperand(CI->getOperand(1)));
3828       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3829         NewI = B.CreateSelect(SI->getCondition(),
3830                               ShrinkOperand(SI->getTrueValue()),
3831                               ShrinkOperand(SI->getFalseValue()));
3832       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3833         switch (CI->getOpcode()) {
3834         default:
3835           llvm_unreachable("Unhandled cast!");
3836         case Instruction::Trunc:
3837           NewI = ShrinkOperand(CI->getOperand(0));
3838           break;
3839         case Instruction::SExt:
3840           NewI = B.CreateSExtOrTrunc(
3841               CI->getOperand(0),
3842               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3843           break;
3844         case Instruction::ZExt:
3845           NewI = B.CreateZExtOrTrunc(
3846               CI->getOperand(0),
3847               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3848           break;
3849         }
3850       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3851         auto Elements0 =
3852             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3853         auto *O0 = B.CreateZExtOrTrunc(
3854             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3855         auto Elements1 =
3856             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3857         auto *O1 = B.CreateZExtOrTrunc(
3858             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3859 
3860         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3861       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3862         // Don't do anything with the operands, just extend the result.
3863         continue;
3864       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3865         auto Elements =
3866             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3867         auto *O0 = B.CreateZExtOrTrunc(
3868             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3869         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3870         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3871       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3872         auto Elements =
3873             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3874         auto *O0 = B.CreateZExtOrTrunc(
3875             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3876         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3877       } else {
3878         // If we don't know what to do, be conservative and don't do anything.
3879         continue;
3880       }
3881 
3882       // Lastly, extend the result.
3883       NewI->takeName(cast<Instruction>(I));
3884       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3885       I->replaceAllUsesWith(Res);
3886       cast<Instruction>(I)->eraseFromParent();
3887       Erased.insert(I);
3888       State.reset(Def, Res, Part);
3889     }
3890   }
3891 
  // Some of the ZExts created above may now be dead (have no uses). Clean
  // them up.
3893   for (const auto &KV : Cost->getMinimalBitwidths()) {
3894     // If the value wasn't vectorized, we must maintain the original scalar
3895     // type. The absence of the value from State indicates that it
3896     // wasn't vectorized.
3897     // FIXME: Should not rely on getVPValue at this point.
3898     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3899     if (!State.hasAnyVectorValue(Def))
3900       continue;
3901     for (unsigned Part = 0; Part < UF; ++Part) {
3902       Value *I = State.get(Def, Part);
3903       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3904       if (Inst && Inst->use_empty()) {
3905         Value *NewI = Inst->getOperand(0);
3906         Inst->eraseFromParent();
3907         State.reset(Def, NewI, Part);
3908       }
3909     }
3910   }
3911 }
3912 
3913 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3914   // Insert truncates and extends for any truncated instructions as hints to
3915   // InstCombine.
3916   if (VF.isVector())
3917     truncateToMinimalBitwidths(State);
3918 
3919   // Fix widened non-induction PHIs by setting up the PHI operands.
3920   if (OrigPHIsToFix.size()) {
3921     assert(EnableVPlanNativePath &&
3922            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3923     fixNonInductionPHIs(State);
3924   }
3925 
3926   // At this point every instruction in the original loop is widened to a
3927   // vector form. Now we need to fix the recurrences in the loop. These PHI
3928   // nodes are currently empty because we did not want to introduce cycles.
3929   // This is the second stage of vectorizing recurrences.
3930   fixCrossIterationPHIs(State);
3931 
3932   // Forget the original basic block.
3933   PSE.getSE()->forgetLoop(OrigLoop);
3934 
3935   // If we inserted an edge from the middle block to the unique exit block,
3936   // update uses outside the loop (phis) to account for the newly inserted
3937   // edge.
3938   if (!Cost->requiresScalarEpilogue(VF)) {
3939     // Fix-up external users of the induction variables.
3940     for (auto &Entry : Legal->getInductionVars())
3941       fixupIVUsers(Entry.first, Entry.second,
3942                    getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3943                    IVEndValues[Entry.first], LoopMiddleBlock);
3944 
3945     fixLCSSAPHIs(State);
3946   }
3947 
3948   for (Instruction *PI : PredicatedInstructions)
3949     sinkScalarOperands(&*PI);
3950 
3951   // Remove redundant induction instructions.
3952   cse(LoopVectorBody);
3953 
3954   // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
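  // For example (illustrative numbers only): with VF = 4 and UF = 2, each
  // vector iteration covers 8 original iterations, so the weights are scaled
  // as if the original loop had been unrolled by a factor of 8.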
3967   setProfileInfoAfterUnrolling(
3968       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3969       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3970 }
3971 
3972 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3973   // In order to support recurrences we need to be able to vectorize Phi nodes.
3974   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3975   // stage #2: We now need to fix the recurrences by adding incoming edges to
3976   // the currently empty PHI nodes. At this point every instruction in the
3977   // original loop is widened to a vector form so we can use them to construct
3978   // the incoming edges.
3979   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
3980   for (VPRecipeBase &R : Header->phis()) {
3981     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3982       fixReduction(ReductionPhi, State);
3983     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3984       fixFirstOrderRecurrence(FOR, State);
3985   }
3986 }
3987 
3988 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3989     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3990   // This is the second phase of vectorizing first-order recurrences. An
3991   // overview of the transformation is described below. Suppose we have the
3992   // following loop.
3993   //
3994   //   for (int i = 0; i < n; ++i)
3995   //     b[i] = a[i] - a[i - 1];
3996   //
3997   // There is a first-order recurrence on "a". For this loop, the shorthand
3998   // scalar IR looks like:
3999   //
4000   //   scalar.ph:
4001   //     s_init = a[-1]
4002   //     br scalar.body
4003   //
4004   //   scalar.body:
4005   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4006   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4007   //     s2 = a[i]
4008   //     b[i] = s2 - s1
4009   //     br cond, scalar.body, ...
4010   //
  // In this example, s1 is a recurrence because its value depends on the
4012   // previous iteration. In the first phase of vectorization, we created a
4013   // vector phi v1 for s1. We now complete the vectorization and produce the
4014   // shorthand vector IR shown below (for VF = 4, UF = 1).
4015   //
4016   //   vector.ph:
4017   //     v_init = vector(..., ..., ..., a[-1])
4018   //     br vector.body
4019   //
4020   //   vector.body
4021   //     i = phi [0, vector.ph], [i+4, vector.body]
4022   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4023   //     v2 = a[i, i+1, i+2, i+3];
4024   //     v3 = vector(v1(3), v2(0, 1, 2))
4025   //     b[i, i+1, i+2, i+3] = v2 - v3
4026   //     br cond, vector.body, middle.block
4027   //
4028   //   middle.block:
4029   //     x = v2(3)
4030   //     br scalar.ph
4031   //
4032   //   scalar.ph:
4033   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4034   //     br scalar.body
4035   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
4038 
4039   // Extract the last vector element in the middle block. This will be the
4040   // initial value for the recurrence when jumping to the scalar loop.
4041   VPValue *PreviousDef = PhiR->getBackedgeValue();
4042   Value *Incoming = State.get(PreviousDef, UF - 1);
4043   auto *ExtractForScalar = Incoming;
4044   auto *IdxTy = Builder.getInt32Ty();
4045   if (VF.isVector()) {
4046     auto *One = ConstantInt::get(IdxTy, 1);
4047     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4048     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4049     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4050     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4051                                                     "vector.recur.extract");
4052   }
4053   // Extract the second last element in the middle block if the
4054   // Phi is used outside the loop. We need to extract the phi itself
4055   // and not the last element (the phi update in the current iteration). This
4056   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4057   // when the scalar loop is not run at all.
4058   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4059   if (VF.isVector()) {
4060     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4061     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4062     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4063         Incoming, Idx, "vector.recur.extract.for.phi");
4064   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second last element when VF > 1.
4069     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4070 
4071   // Fix the initial value of the original recurrence in the scalar loop.
4072   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4073   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4074   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4075   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4076   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4077     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4078     Start->addIncoming(Incoming, BB);
4079   }
4080 
4081   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4082   Phi->setName("scalar.recur");
4083 
4084   // Finally, fix users of the recurrence outside the loop. The users will need
4085   // either the last value of the scalar recurrence or the last value of the
4086   // vector recurrence we extracted in the middle block. Since the loop is in
4087   // LCSSA form, we just need to find all the phi nodes for the original scalar
4088   // recurrence in the exit block, and then add an edge for the middle block.
4089   // Note that LCSSA does not imply single entry when the original scalar loop
4090   // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from the middle block to
  // the exit block, and thus no phis need to be updated.
4093   if (!Cost->requiresScalarEpilogue(VF))
4094     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4095       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4096         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4097 }
4098 
4099 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4100                                        VPTransformState &State) {
4101   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
4103   assert(Legal->isReductionVariable(OrigPhi) &&
4104          "Unable to find the reduction variable");
4105   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4106 
4107   RecurKind RK = RdxDesc.getRecurrenceKind();
4108   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4109   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4110   setDebugLocFromInst(ReductionStartValue);
4111 
4112   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4113   // This is the vector-clone of the value that leaves the loop.
4114   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4115 
4116   // Wrap flags are in general invalid after vectorization, clear them.
4117   clearReductionWrapFlags(RdxDesc, State);
4118 
4119   // Before each round, move the insertion point right between
4120   // the PHIs and the values we are going to write.
4121   // This allows us to write both PHINodes and the extractelement
4122   // instructions.
4123   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4124 
4125   setDebugLocFromInst(LoopExitInst);
4126 
4127   Type *PhiTy = OrigPhi->getType();
4128   // If tail is folded by masking, the vector value to leave the loop should be
4129   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4130   // instead of the former. For an inloop reduction the reduction will already
4131   // be predicated, and does not need to be handled here.
4132   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4133     for (unsigned Part = 0; Part < UF; ++Part) {
4134       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4135       Value *Sel = nullptr;
4136       for (User *U : VecLoopExitInst->users()) {
4137         if (isa<SelectInst>(U)) {
4138           assert(!Sel && "Reduction exit feeding two selects");
4139           Sel = U;
4140         } else
4141           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4142       }
4143       assert(Sel && "Reduction exit feeds no select");
4144       State.reset(LoopExitInstDef, Sel, Part);
4145 
4146       // If the target can create a predicated operator for the reduction at no
4147       // extra cost in the loop (for example a predicated vadd), it can be
4148       // cheaper for the select to remain in the loop than be sunk out of it,
4149       // and so use the select value for the phi instead of the old
4150       // LoopExitValue.
4151       if (PreferPredicatedReductionSelect ||
4152           TTI->preferPredicatedReductionSelect(
4153               RdxDesc.getOpcode(), PhiTy,
4154               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
4157         VecRdxPhi->setIncomingValueForBlock(
4158             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4159       }
4160     }
4161   }
4162 
4163   // If the vector reduction can be performed in a smaller type, we truncate
4164   // then extend the loop exit value to enable InstCombine to evaluate the
4165   // entire expression in the smaller type.
4166   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4167     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4168     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4169     Builder.SetInsertPoint(
4170         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4171     VectorParts RdxParts(UF);
4172     for (unsigned Part = 0; Part < UF; ++Part) {
4173       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4174       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4175       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4176                                         : Builder.CreateZExt(Trunc, VecTy);
4177       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
4178         if (U != Trunc) {
4179           U->replaceUsesOfWith(RdxParts[Part], Extnd);
4180           RdxParts[Part] = Extnd;
4181         }
4182     }
4183     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4184     for (unsigned Part = 0; Part < UF; ++Part) {
4185       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4186       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4187     }
4188   }
4189 
4190   // Reduce all of the unrolled parts into a single vector.
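  // For illustration (assuming an integer add reduction with UF = 2): the two
  // unrolled parts are combined below into a single 'bin.rdx' add, and the
  // resulting vector is later reduced to a scalar by createTargetReduction.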
4191   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4192   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4193 
4194   // The middle block terminator has already been assigned a DebugLoc here (the
4195   // OrigLoop's single latch terminator). We want the whole middle block to
4196   // appear to execute on this line because: (a) it is all compiler generated,
4197   // (b) these instructions are always executed after evaluating the latch
4198   // conditional branch, and (c) other passes may add new predecessors which
4199   // terminate on this line. This is the easiest way to ensure we don't
4200   // accidentally cause an extra step back into the loop while debugging.
4201   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4202   if (PhiR->isOrdered())
4203     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4204   else {
4205     // Floating-point operations should have some FMF to enable the reduction.
4206     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4207     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4208     for (unsigned Part = 1; Part < UF; ++Part) {
4209       Value *RdxPart = State.get(LoopExitInstDef, Part);
4210       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4211         ReducedPartRdx = Builder.CreateBinOp(
4212             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4213       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4214         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4215                                            ReducedPartRdx, RdxPart);
4216       else
4217         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4218     }
4219   }
4220 
4221   // Create the reduction after the loop. Note that inloop reductions create the
4222   // target reduction in the loop using a Reduction recipe.
4223   if (VF.isVector() && !PhiR->isInLoop()) {
4224     ReducedPartRdx =
4225         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4226     // If the reduction can be performed in a smaller type, we need to extend
4227     // the reduction to the wider type before we branch to the original loop.
4228     if (PhiTy != RdxDesc.getRecurrenceType())
4229       ReducedPartRdx = RdxDesc.isSigned()
4230                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4231                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4232   }
4233 
4234   PHINode *ResumePhi =
4235       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4236 
4237   // Create a phi node that merges control-flow from the backedge-taken check
4238   // block and the middle block.
4239   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4240                                         LoopScalarPreHeader->getTerminator());
4241 
4242   // If we are fixing reductions in the epilogue loop then we should already
4243   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4244   // we carry over the incoming values correctly.
4245   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4246     if (Incoming == LoopMiddleBlock)
4247       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4248     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4249       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4250                               Incoming);
4251     else
4252       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4253   }
4254 
4255   // Set the resume value for this reduction
4256   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4257 
4258   // Now, we need to fix the users of the reduction variable
4259   // inside and outside of the scalar remainder loop.
4260 
4261   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4262   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4264   if (!Cost->requiresScalarEpilogue(VF))
4265     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4266       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4267         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4268 
4269   // Fix the scalar loop reduction variable with the incoming reduction sum
4270   // from the vector body and from the backedge value.
4271   int IncomingEdgeBlockIdx =
4272       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4273   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4274   // Pick the other block.
4275   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4276   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4277   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4278 }
4279 
4280 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4281                                                   VPTransformState &State) {
4282   RecurKind RK = RdxDesc.getRecurrenceKind();
4283   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4284     return;
4285 
4286   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4287   assert(LoopExitInstr && "null loop exit instruction");
4288   SmallVector<Instruction *, 8> Worklist;
4289   SmallPtrSet<Instruction *, 8> Visited;
4290   Worklist.push_back(LoopExitInstr);
4291   Visited.insert(LoopExitInstr);
4292 
4293   while (!Worklist.empty()) {
4294     Instruction *Cur = Worklist.pop_back_val();
4295     if (isa<OverflowingBinaryOperator>(Cur))
4296       for (unsigned Part = 0; Part < UF; ++Part) {
4297         // FIXME: Should not rely on getVPValue at this point.
4298         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4299         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4300       }
4301 
4302     for (User *U : Cur->users()) {
4303       Instruction *UI = cast<Instruction>(U);
4304       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4305           Visited.insert(UI).second)
4306         Worklist.push_back(UI);
4307     }
4308   }
4309 }
4310 
4311 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4312   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4313     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4314       // Some phis were already hand updated by the reduction and recurrence
4315       // code above, leave them alone.
4316       continue;
4317 
4318     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4319     // Non-instruction incoming values will have only one value.
4320 
4321     VPLane Lane = VPLane::getFirstLane();
4322     if (isa<Instruction>(IncomingValue) &&
4323         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4324                                            VF))
4325       Lane = VPLane::getLastLaneForVF(VF);
4326 
4327     // Can be a loop invariant incoming value or the last scalar value to be
4328     // extracted from the vectorized loop.
4329     // FIXME: Should not rely on getVPValue at this point.
4330     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4331     Value *lastIncomingValue =
4332         OrigLoop->isLoopInvariant(IncomingValue)
4333             ? IncomingValue
4334             : State.get(State.Plan->getVPValue(IncomingValue, true),
4335                         VPIteration(UF - 1, Lane));
4336     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4337   }
4338 }
4339 
4340 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4341   // The basic block and loop containing the predicated instruction.
4342   auto *PredBB = PredInst->getParent();
4343   auto *VectorLoop = LI->getLoopFor(PredBB);
4344 
4345   // Initialize a worklist with the operands of the predicated instruction.
4346   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4347 
4348   // Holds instructions that we need to analyze again. An instruction may be
4349   // reanalyzed if we don't yet know if we can sink it or not.
4350   SmallVector<Instruction *, 8> InstsToReanalyze;
4351 
4352   // Returns true if a given use occurs in the predicated block. Phi nodes use
4353   // their operands in their corresponding predecessor blocks.
4354   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4355     auto *I = cast<Instruction>(U.getUser());
4356     BasicBlock *BB = I->getParent();
4357     if (auto *Phi = dyn_cast<PHINode>(I))
4358       BB = Phi->getIncomingBlock(
4359           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4360     return BB == PredBB;
4361   };
4362 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass over the worklist fails to sink a single instruction.
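  // For illustration (hypothetical scenario): if a predicated store's address
  // computation (a GEP) is used only inside the predicated block, the GEP is
  // moved into that block; its operands are then enqueued and may be sunk on
  // a later pass over the worklist.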
4367   bool Changed;
4368   do {
4369     // Add the instructions that need to be reanalyzed to the worklist, and
4370     // reset the changed indicator.
4371     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4372     InstsToReanalyze.clear();
4373     Changed = false;
4374 
4375     while (!Worklist.empty()) {
4376       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4377 
4378       // We can't sink an instruction if it is a phi node, is not in the loop,
4379       // or may have side effects.
4380       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4381           I->mayHaveSideEffects())
4382         continue;
4383 
4384       // If the instruction is already in PredBB, check if we can sink its
4385       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4386       // sinking the scalar instruction I, hence it appears in PredBB; but it
4387       // may have failed to sink I's operands (recursively), which we try
4388       // (again) here.
4389       if (I->getParent() == PredBB) {
4390         Worklist.insert(I->op_begin(), I->op_end());
4391         continue;
4392       }
4393 
4394       // It's legal to sink the instruction if all its uses occur in the
4395       // predicated block. Otherwise, there's nothing to do yet, and we may
4396       // need to reanalyze the instruction.
4397       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4398         InstsToReanalyze.push_back(I);
4399         continue;
4400       }
4401 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4404       I->moveBefore(&*PredBB->getFirstInsertionPt());
4405       Worklist.insert(I->op_begin(), I->op_end());
4406 
4407       // The sinking may have enabled other instructions to be sunk, so we will
4408       // need to iterate.
4409       Changed = true;
4410     }
4411   } while (Changed);
4412 }
4413 
4414 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4415   for (PHINode *OrigPhi : OrigPHIsToFix) {
4416     VPWidenPHIRecipe *VPPhi =
4417         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4418     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4419     // Make sure the builder has a valid insert point.
4420     Builder.SetInsertPoint(NewPhi);
4421     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4422       VPValue *Inc = VPPhi->getIncomingValue(i);
4423       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4424       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4425     }
4426   }
4427 }
4428 
4429 bool InnerLoopVectorizer::useOrderedReductions(
4430     const RecurrenceDescriptor &RdxDesc) {
4431   return Cost->useOrderedReductions(RdxDesc);
4432 }
4433 
4434 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4435                                               VPWidenPHIRecipe *PhiR,
4436                                               VPTransformState &State) {
4437   PHINode *P = cast<PHINode>(PN);
4438   if (EnableVPlanNativePath) {
4439     // Currently we enter here in the VPlan-native path for non-induction
4440     // PHIs where all control flow is uniform. We simply widen these PHIs.
4441     // Create a vector phi with no operands - the vector phi operands will be
4442     // set at the end of vector code generation.
4443     Type *VecTy = (State.VF.isScalar())
4444                       ? PN->getType()
4445                       : VectorType::get(PN->getType(), State.VF);
4446     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4447     State.set(PhiR, VecPhi, 0);
4448     OrigPHIsToFix.push_back(P);
4449 
4450     return;
4451   }
4452 
4453   assert(PN->getParent() == OrigLoop->getHeader() &&
4454          "Non-header phis should have been handled elsewhere");
4455 
4456   // In order to support recurrences we need to be able to vectorize Phi nodes.
4457   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4458   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4459   // this value when we vectorize all of the instructions that use the PHI.
4460 
4461   assert(!Legal->isReductionVariable(P) &&
4462          "reductions should be handled elsewhere");
4463 
4464   setDebugLocFromInst(P);
4465 
4466   // This PHINode must be an induction variable.
4467   // Make sure that we know about it.
4468   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4469 
4470   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4471   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4472 
4473   auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
4474   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
4475 
4476   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4477   // which can be found from the original scalar operations.
4478   switch (II.getKind()) {
4479   case InductionDescriptor::IK_NoInduction:
4480     llvm_unreachable("Unknown induction");
4481   case InductionDescriptor::IK_IntInduction:
4482   case InductionDescriptor::IK_FpInduction:
4483     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4484   case InductionDescriptor::IK_PtrInduction: {
4485     // Handle the pointer induction variable case.
4486     assert(P->getType()->isPointerTy() && "Unexpected type.");
4487 
4488     if (Cost->isScalarAfterVectorization(P, State.VF)) {
      // This is the normalized scalar index (starting at zero) from which the
      // scalar GEPs below are computed.
4490       Value *PtrInd =
4491           Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
4492       // Determine the number of scalars we need to generate for each unroll
4493       // iteration. If the instruction is uniform, we only need to generate the
4494       // first lane. Otherwise, we generate all VF values.
4495       bool IsUniform = vputils::onlyFirstLaneUsed(PhiR);
4496       assert((IsUniform || !State.VF.isScalable()) &&
4497              "Cannot scalarize a scalable VF");
4498       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4499 
4500       for (unsigned Part = 0; Part < UF; ++Part) {
4501         Value *PartStart =
4502             createStepForVF(Builder, PtrInd->getType(), VF, Part);
4503 
4504         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4505           Value *Idx = Builder.CreateAdd(
4506               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4507           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4508 
4509           Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
4510                                         State.CFG.PrevBB->getTerminator());
4511           Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, Step, II);
4512           SclrGep->setName("next.gep");
4513           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4514         }
4515       }
4516       return;
4517     }
4518     assert(isa<SCEVConstant>(II.getStep()) &&
4519            "Induction step not a SCEV constant!");
4520     Type *PhiType = II.getStep()->getType();
4521 
4522     // Build a pointer phi
4523     Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
4524     Type *ScStValueType = ScalarStartValue->getType();
4525     PHINode *NewPointerPhi =
4526         PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
4527     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4528 
4529     // A pointer induction, performed by using a gep
4530     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4531     Instruction *InductionLoc = LoopLatch->getTerminator();
4532     const SCEV *ScalarStep = II.getStep();
4533     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4534     Value *ScalarStepValue =
4535         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4536     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4537     Value *NumUnrolledElems =
4538         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4539     Value *InductionGEP = GetElementPtrInst::Create(
4540         II.getElementType(), NewPointerPhi,
4541         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4542         InductionLoc);
4543     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4544 
4545     // Create UF many actual address geps that use the pointer
4546     // phi as base and a vectorized version of the step value
4547     // (<step*0, ..., step*N>) as offset.
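    // For illustration (shorthand IR, assuming fixed VF = 4, UF = 1, an i32
    // element type and a unit step):
    //   %pointer.phi = phi [ %start, %vector.ph ], [ %ptr.ind, %latch ]
    //   %vector.gep  = getelementptr i32, ptr %pointer.phi,
    //                    <4 x i64> <i64 0, i64 1, i64 2, i64 3>
    //   %ptr.ind     = getelementptr i32, ptr %pointer.phi, i64 4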
4548     for (unsigned Part = 0; Part < State.UF; ++Part) {
4549       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4550       Value *StartOffsetScalar =
4551           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4552       Value *StartOffset =
4553           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4554       // Create a vector of consecutive numbers from zero to VF.
4555       StartOffset =
4556           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4557 
4558       Value *GEP = Builder.CreateGEP(
4559           II.getElementType(), NewPointerPhi,
4560           Builder.CreateMul(
4561               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4562               "vector.gep"));
4563       State.set(PhiR, GEP, Part);
4564     }
4565   }
4566   }
4567 }
4568 
4569 /// A helper function for checking whether an integer division-related
4570 /// instruction may divide by zero (in which case it must be predicated if
4571 /// executed conditionally in the scalar code).
4572 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
4575 /// the division, but can do so w/o predication.
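/// For example (illustrative only): 'udiv i32 %a, %b' may divide by zero
/// when %b is not a compile-time constant, and 'udiv i32 %a, 0' always does,
/// while 'udiv i32 %a, 7' cannot and needs no predication on this account.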
4576 static bool mayDivideByZero(Instruction &I) {
4577   assert((I.getOpcode() == Instruction::UDiv ||
4578           I.getOpcode() == Instruction::SDiv ||
4579           I.getOpcode() == Instruction::URem ||
4580           I.getOpcode() == Instruction::SRem) &&
4581          "Unexpected instruction");
4582   Value *Divisor = I.getOperand(1);
4583   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4584   return !CInt || CInt->isZero();
4585 }
4586 
4587 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4588                                                VPUser &ArgOperands,
4589                                                VPTransformState &State) {
4590   assert(!isa<DbgInfoIntrinsic>(I) &&
4591          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4592   setDebugLocFromInst(&I);
4593 
4594   Module *M = I.getParent()->getParent()->getParent();
4595   auto *CI = cast<CallInst>(&I);
4596 
4597   SmallVector<Type *, 4> Tys;
4598   for (Value *ArgOperand : CI->args())
4599     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4600 
4601   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4602 
  // Decide whether the vectorized version of the instruction should use an
  // intrinsic or an ordinary call: the intrinsic is preferred when its cost
  // does not exceed the cost of the (possibly scalarized) call.
4606   bool NeedToScalarize = false;
4607   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4608   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4609   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4610   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4611          "Instruction should be scalarized elsewhere.");
4612   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4613          "Either the intrinsic cost or vector call cost must be valid");
4614 
4615   for (unsigned Part = 0; Part < UF; ++Part) {
4616     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4617     SmallVector<Value *, 4> Args;
4618     for (auto &I : enumerate(ArgOperands.operands())) {
4619       // Some intrinsics have a scalar argument - don't replace it with a
4620       // vector.
4621       Value *Arg;
4622       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4623         Arg = State.get(I.value(), Part);
4624       else {
4625         Arg = State.get(I.value(), VPIteration(0, 0));
4626         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4627           TysForDecl.push_back(Arg->getType());
4628       }
4629       Args.push_back(Arg);
4630     }
4631 
4632     Function *VectorF;
4633     if (UseVectorIntrinsic) {
4634       // Use vector version of the intrinsic.
4635       if (VF.isVector())
4636         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4637       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4638       assert(VectorF && "Can't retrieve vector intrinsic.");
4639     } else {
4640       // Use vector version of the function call.
4641       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4642 #ifndef NDEBUG
4643       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4644              "Can't create vector function.");
4645 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
4657   }
4658 }
4659 
4660 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4661   // We should not collect Scalars more than once per VF. Right now, this
4662   // function is called from collectUniformsAndScalars(), which already does
4663   // this check. Collecting Scalars for VF=1 does not make any sense.
4664   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4665          "This function should not be visited twice for the same VF");
4666 
4667   SmallSetVector<Instruction *, 8> Worklist;
4668 
4669   // These sets are used to seed the analysis with pointers used by memory
4670   // accesses that will remain scalar.
4671   SmallSetVector<Instruction *, 8> ScalarPtrs;
4672   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4673   auto *Latch = TheLoop->getLoopLatch();
4674 
4675   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4676   // The pointer operands of loads and stores will be scalar as long as the
4677   // memory access is not a gather or scatter operation. The value operand of a
4678   // store will remain scalar if the store is scalarized.
4679   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4680     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4681     assert(WideningDecision != CM_Unknown &&
4682            "Widening decision should be ready at this moment");
4683     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4684       if (Ptr == Store->getValueOperand())
4685         return WideningDecision == CM_Scalarize;
4686     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4687            "Ptr is neither a value or pointer operand");
4688     return WideningDecision != CM_GatherScatter;
4689   };
4690 
4691   // A helper that returns true if the given value is a bitcast or
4692   // getelementptr instruction contained in the loop.
4693   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4694     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4695             isa<GetElementPtrInst>(V)) &&
4696            !TheLoop->isLoopInvariant(V);
4697   };
4698 
4699   // A helper that evaluates a memory access's use of a pointer. If the use will
4700   // be a scalar use and the pointer is only used by memory accesses, we place
4701   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4702   // PossibleNonScalarPtrs.
4703   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4704     // We only care about bitcast and getelementptr instructions contained in
4705     // the loop.
4706     if (!isLoopVaryingBitCastOrGEP(Ptr))
4707       return;
4708 
4709     // If the pointer has already been identified as scalar (e.g., if it was
4710     // also identified as uniform), there's nothing to do.
4711     auto *I = cast<Instruction>(Ptr);
4712     if (Worklist.count(I))
4713       return;
4714 
4715     // If the use of the pointer will be a scalar use, and all users of the
4716     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4717     // place the pointer in PossibleNonScalarPtrs.
4718     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4719           return isa<LoadInst>(U) || isa<StoreInst>(U);
4720         }))
4721       ScalarPtrs.insert(I);
4722     else
4723       PossibleNonScalarPtrs.insert(I);
4724   };
4725 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4730   //
4731   // (1) Add to the worklist all instructions that have been identified as
4732   // uniform-after-vectorization.
4733   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4734 
4735   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4736   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4738   // scatter operation. The value operand of a store will remain scalar if the
4739   // store is scalarized.
4740   for (auto *BB : TheLoop->blocks())
4741     for (auto &I : *BB) {
4742       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4743         evaluatePtrUse(Load, Load->getPointerOperand());
4744       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4745         evaluatePtrUse(Store, Store->getPointerOperand());
4746         evaluatePtrUse(Store, Store->getValueOperand());
4747       }
4748     }
4749   for (auto *I : ScalarPtrs)
4750     if (!PossibleNonScalarPtrs.count(I)) {
4751       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4752       Worklist.insert(I);
4753     }
4754 
4755   // Insert the forced scalars.
4756   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4757   // induction variable when the PHI user is scalarized.
4758   auto ForcedScalar = ForcedScalars.find(VF);
4759   if (ForcedScalar != ForcedScalars.end())
4760     for (auto *I : ForcedScalar->second)
4761       Worklist.insert(I);
4762 
4763   // Expand the worklist by looking through any bitcasts and getelementptr
4764   // instructions we've already identified as scalar. This is similar to the
4765   // expansion step in collectLoopUniforms(); however, here we're only
4766   // expanding to include additional bitcasts and getelementptr instructions.
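  // For example, if a scalarized store's bitcast pointer operand is already in
  // the worklist and the bitcast's source getelementptr has no other users in
  // the loop, the getelementptr is added to the worklist as well.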
4767   unsigned Idx = 0;
4768   while (Idx != Worklist.size()) {
4769     Instruction *Dst = Worklist[Idx++];
4770     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4771       continue;
4772     auto *Src = cast<Instruction>(Dst->getOperand(0));
4773     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4774           auto *J = cast<Instruction>(U);
4775           return !TheLoop->contains(J) || Worklist.count(J) ||
4776                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4777                   isScalarUse(J, Src));
4778         })) {
4779       Worklist.insert(Src);
4780       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4781     }
4782   }
4783 
4784   // An induction variable will remain scalar if all users of the induction
4785   // variable and induction variable update remain scalar.
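  // For example, an induction phi whose only in-loop users are its update
  // instruction and getelementptrs already in the worklist remains scalar,
  // together with the update, provided the update's users are likewise scalar.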
4786   for (auto &Induction : Legal->getInductionVars()) {
4787     auto *Ind = Induction.first;
4788     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4789 
4790     // If tail-folding is applied, the primary induction variable will be used
4791     // to feed a vector compare.
4792     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4793       continue;
4794 
4795     // Returns true if \p Indvar is a pointer induction that is used directly by
4796     // load/store instruction \p I.
4797     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4798                                               Instruction *I) {
4799       return Induction.second.getKind() ==
4800                  InductionDescriptor::IK_PtrInduction &&
4801              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4802              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4803     };
4804 
4805     // Determine if all users of the induction variable are scalar after
4806     // vectorization.
4807     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4808       auto *I = cast<Instruction>(U);
4809       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4810              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4811     });
4812     if (!ScalarInd)
4813       continue;
4814 
4815     // Determine if all users of the induction variable update instruction are
4816     // scalar after vectorization.
4817     auto ScalarIndUpdate =
4818         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4819           auto *I = cast<Instruction>(U);
4820           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4821                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4822         });
4823     if (!ScalarIndUpdate)
4824       continue;
4825 
4826     // The induction variable and its update instruction will remain scalar.
4827     Worklist.insert(Ind);
4828     Worklist.insert(IndUpdate);
4829     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4830     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4831                       << "\n");
4832   }
4833 
4834   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4835 }
4836 
4837 bool LoopVectorizationCostModel::isScalarWithPredication(
4838     Instruction *I, ElementCount VF) const {
4839   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4840     return false;
  switch (I->getOpcode()) {
4842   default:
4843     break;
4844   case Instruction::Load:
4845   case Instruction::Store: {
4846     if (!Legal->isMaskRequired(I))
4847       return false;
4848     auto *Ptr = getLoadStorePointerOperand(I);
4849     auto *Ty = getLoadStoreType(I);
4850     Type *VTy = Ty;
4851     if (VF.isVector())
4852       VTy = VectorType::get(Ty, VF);
4853     const Align Alignment = getLoadStoreAlignment(I);
4854     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4855                                 TTI.isLegalMaskedGather(VTy, Alignment))
4856                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4857                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4858   }
4859   case Instruction::UDiv:
4860   case Instruction::SDiv:
4861   case Instruction::SRem:
4862   case Instruction::URem:
4863     return mayDivideByZero(*I);
4864   }
4865   return false;
4866 }
4867 
4868 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4869     Instruction *I, ElementCount VF) {
4870   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4871   assert(getWideningDecision(I, VF) == CM_Unknown &&
4872          "Decision should not be set yet.");
4873   auto *Group = getInterleavedAccessGroup(I);
4874   assert(Group && "Must have a group.");
4875 
  // If the instruction's allocated size doesn't equal its type size, it
4877   // requires padding and will be scalarized.
4878   auto &DL = I->getModule()->getDataLayout();
4879   auto *ScalarTy = getLoadStoreType(I);
4880   if (hasIrregularType(ScalarTy, DL))
4881     return false;
4882 
4883   // Check if masking is required.
4884   // A Group may need masking for one of two reasons: it resides in a block that
4885   // needs predication, or it was decided to use masking to deal with gaps
4886   // (either a gap at the end of a load-access that may result in a speculative
4887   // load, or any gaps in a store-access).
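  // For example (illustrative), a load group with factor 2 whose second member
  // is missing leaves a gap at the end of the last wide load; unless a scalar
  // epilogue is allowed, that trailing access must be masked to avoid
  // speculatively reading past the end of the underlying object.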
4888   bool PredicatedAccessRequiresMasking =
4889       blockNeedsPredicationForAnyReason(I->getParent()) &&
4890       Legal->isMaskRequired(I);
4891   bool LoadAccessWithGapsRequiresEpilogMasking =
4892       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4893       !isScalarEpilogueAllowed();
4894   bool StoreAccessWithGapsRequiresMasking =
4895       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4896   if (!PredicatedAccessRequiresMasking &&
4897       !LoadAccessWithGapsRequiresEpilogMasking &&
4898       !StoreAccessWithGapsRequiresMasking)
4899     return true;
4900 
4901   // If masked interleaving is required, we expect that the user/target had
4902   // enabled it, because otherwise it either wouldn't have been created or
4903   // it should have been invalidated by the CostModel.
4904   assert(useMaskedInterleavedAccesses(TTI) &&
4905          "Masked interleave-groups for predicated accesses are not enabled.");
4906 
4907   if (Group->isReverse())
4908     return false;
4909 
4910   auto *Ty = getLoadStoreType(I);
4911   const Align Alignment = getLoadStoreAlignment(I);
4912   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4913                           : TTI.isLegalMaskedStore(Ty, Alignment);
4914 }
4915 
4916 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4917     Instruction *I, ElementCount VF) {
4918   // Get and ensure we have a valid memory instruction.
4919   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4920 
4921   auto *Ptr = getLoadStorePointerOperand(I);
4922   auto *ScalarTy = getLoadStoreType(I);
4923 
4924   // In order to be widened, the pointer should be consecutive, first of all.
4925   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4926     return false;
4927 
  // If the instruction is a load or store that is located in a predicated
  // block and must be scalarized, it cannot be widened.
4930   if (isScalarWithPredication(I, VF))
4931     return false;
4932 
  // If the instruction's allocated size doesn't equal its type size, it
4934   // requires padding and will be scalarized.
4935   auto &DL = I->getModule()->getDataLayout();
4936   if (hasIrregularType(ScalarTy, DL))
4937     return false;
4938 
4939   return true;
4940 }
4941 
4942 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4943   // We should not collect Uniforms more than once per VF. Right now,
4944   // this function is called from collectUniformsAndScalars(), which
4945   // already does this check. Collecting Uniforms for VF=1 does not make any
4946   // sense.
4947 
4948   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4949          "This function should not be visited twice for the same VF");
4950 
  // Initialize the entry for this VF. Even if we do not find any uniform
  // value, the (empty) entry ensures we will not analyze this VF again:
  // Uniforms.count(VF) will return 1.
4953   Uniforms[VF].clear();
4954 
4955   // We now know that the loop is vectorizable!
4956   // Collect instructions inside the loop that will remain uniform after
4957   // vectorization.
4958 
4959   // Global values, params and instructions outside of current loop are out of
4960   // scope.
4961   auto isOutOfScope = [&](Value *V) -> bool {
4962     Instruction *I = dyn_cast<Instruction>(V);
4963     return (!I || !TheLoop->contains(I));
4964   };
4965 
4966   // Worklist containing uniform instructions demanding lane 0.
4967   SetVector<Instruction *> Worklist;
4968   BasicBlock *Latch = TheLoop->getLoopLatch();
4969 
4970   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4971   // that are scalar with predication must not be considered uniform after
4972   // vectorization, because that would create an erroneous replicating region
4973   // where only a single instance out of VF should be formed.
  // TODO: Optimize such rare cases if they turn out to be important, see PR40816.
4975   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4976     if (isOutOfScope(I)) {
4977       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4978                         << *I << "\n");
4979       return;
4980     }
4981     if (isScalarWithPredication(I, VF)) {
4982       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4983                         << *I << "\n");
4984       return;
4985     }
4986     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4987     Worklist.insert(I);
4988   };
4989 
4990   // Start with the conditional branch. If the branch condition is an
4991   // instruction contained in the loop that is only used by the branch, it is
4992   // uniform.
4993   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4994   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4995     addToWorklistIfAllowed(Cmp);
4996 
4997   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4998     InstWidening WideningDecision = getWideningDecision(I, VF);
4999     assert(WideningDecision != CM_Unknown &&
5000            "Widening decision should be ready at this moment");
5001 
5002     // A uniform memory op is itself uniform.  We exclude uniform stores
5003     // here as they demand the last lane, not the first one.
5004     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5005       assert(WideningDecision == CM_Scalarize);
5006       return true;
5007     }
5008 
5009     return (WideningDecision == CM_Widen ||
5010             WideningDecision == CM_Widen_Reverse ||
5011             WideningDecision == CM_Interleave);
5012   };
5013 
5014 
5015   // Returns true if Ptr is the pointer operand of a memory access instruction
5016   // I, and I is known to not require scalarization.
5017   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5018     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5019   };
5020 
5021   // Holds a list of values which are known to have at least one uniform use.
5022   // Note that there may be other uses which aren't uniform.  A "uniform use"
5023   // here is something which only demands lane 0 of the unrolled iterations;
5024   // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform).
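  // For example, the pointer operand of a consecutive widened load has a
  // uniform use: only the lane-0 address is needed to form the wide load, even
  // though the loaded values differ per lane.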
5026   SetVector<Value *> HasUniformUse;
5027 
5028   // Scan the loop for instructions which are either a) known to have only
5029   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5030   for (auto *BB : TheLoop->blocks())
5031     for (auto &I : *BB) {
5032       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5033         switch (II->getIntrinsicID()) {
5034         case Intrinsic::sideeffect:
5035         case Intrinsic::experimental_noalias_scope_decl:
5036         case Intrinsic::assume:
5037         case Intrinsic::lifetime_start:
5038         case Intrinsic::lifetime_end:
5039           if (TheLoop->hasLoopInvariantOperands(&I))
5040             addToWorklistIfAllowed(&I);
5041           break;
5042         default:
5043           break;
5044         }
5045       }
5046 
5047       // ExtractValue instructions must be uniform, because the operands are
5048       // known to be loop-invariant.
5049       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5050         assert(isOutOfScope(EVI->getAggregateOperand()) &&
5051                "Expected aggregate value to be loop invariant");
5052         addToWorklistIfAllowed(EVI);
5053         continue;
5054       }
5055 
5056       // If there's no pointer operand, there's nothing to do.
5057       auto *Ptr = getLoadStorePointerOperand(&I);
5058       if (!Ptr)
5059         continue;
5060 
5061       // A uniform memory op is itself uniform.  We exclude uniform stores
5062       // here as they demand the last lane, not the first one.
5063       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5064         addToWorklistIfAllowed(&I);
5065 
5066       if (isUniformDecision(&I, VF)) {
5067         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5068         HasUniformUse.insert(Ptr);
5069       }
5070     }
5071 
5072   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5073   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5074   // disallows uses outside the loop as well.
5075   for (auto *V : HasUniformUse) {
5076     if (isOutOfScope(V))
5077       continue;
5078     auto *I = cast<Instruction>(V);
5079     auto UsersAreMemAccesses =
5080       llvm::all_of(I->users(), [&](User *U) -> bool {
5081         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5082       });
5083     if (UsersAreMemAccesses)
5084       addToWorklistIfAllowed(I);
5085   }
5086 
  // Expand Worklist in topological order: whenever a new instruction is added,
  // its users should already be inside Worklist. This ensures a uniform
  // instruction will only be used by uniform instructions.
5090   unsigned idx = 0;
5091   while (idx != Worklist.size()) {
5092     Instruction *I = Worklist[idx++];
5093 
5094     for (auto OV : I->operand_values()) {
      // Out-of-scope operands cannot be uniform instructions.
5096       if (isOutOfScope(OV))
5097         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5100       auto *OP = dyn_cast<PHINode>(OV);
5101       if (OP && Legal->isFirstOrderRecurrence(OP))
5102         continue;
5103       // If all the users of the operand are uniform, then add the
5104       // operand into the uniform worklist.
5105       auto *OI = cast<Instruction>(OV);
5106       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5107             auto *J = cast<Instruction>(U);
5108             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5109           }))
5110         addToWorklistIfAllowed(OI);
5111     }
5112   }
5113 
5114   // For an instruction to be added into Worklist above, all its users inside
5115   // the loop should also be in Worklist. However, this condition cannot be
5116   // true for phi nodes that form a cyclic dependence. We must process phi
5117   // nodes separately. An induction variable will remain uniform if all users
5118   // of the induction variable and induction variable update remain uniform.
5119   // The code below handles both pointer and non-pointer induction variables.
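  // For example, the canonical induction
  //   %iv = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %iv.next = add i64 %iv, 1
  // forms a cycle: neither instruction can be the first to enter the worklist
  // via the expansion above, so both are checked together here.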
5120   for (auto &Induction : Legal->getInductionVars()) {
5121     auto *Ind = Induction.first;
5122     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5123 
5124     // Determine if all users of the induction variable are uniform after
5125     // vectorization.
5126     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5127       auto *I = cast<Instruction>(U);
5128       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5129              isVectorizedMemAccessUse(I, Ind);
5130     });
5131     if (!UniformInd)
5132       continue;
5133 
5134     // Determine if all users of the induction variable update instruction are
5135     // uniform after vectorization.
5136     auto UniformIndUpdate =
5137         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5138           auto *I = cast<Instruction>(U);
5139           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5140                  isVectorizedMemAccessUse(I, IndUpdate);
5141         });
5142     if (!UniformIndUpdate)
5143       continue;
5144 
5145     // The induction variable and its update instruction will remain uniform.
5146     addToWorklistIfAllowed(Ind);
5147     addToWorklistIfAllowed(IndUpdate);
5148   }
5149 
5150   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5151 }
5152 
5153 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5154   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5155 
5156   if (Legal->getRuntimePointerChecking()->Need) {
5157     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5158         "runtime pointer checks needed. Enable vectorization of this "
5159         "loop with '#pragma clang loop vectorize(enable)' when "
5160         "compiling with -Os/-Oz",
5161         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5162     return true;
5163   }
5164 
5165   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5166     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5167         "runtime SCEV checks needed. Enable vectorization of this "
5168         "loop with '#pragma clang loop vectorize(enable)' when "
5169         "compiling with -Os/-Oz",
5170         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5171     return true;
5172   }
5173 
5174   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5175   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5176     reportVectorizationFailure("Runtime stride check for small trip count",
5177         "runtime stride == 1 checks needed. Enable vectorization of "
5178         "this loop without such check by compiling with -Os/-Oz",
5179         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5180     return true;
5181   }
5182 
5183   return false;
5184 }
5185 
5186 ElementCount
5187 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5188   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5189     return ElementCount::getScalable(0);
5190 
5191   if (Hints->isScalableVectorizationDisabled()) {
5192     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5193                             "ScalableVectorizationDisabled", ORE, TheLoop);
5194     return ElementCount::getScalable(0);
5195   }
5196 
5197   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5198 
5199   auto MaxScalableVF = ElementCount::getScalable(
5200       std::numeric_limits<ElementCount::ScalarTy>::max());
5201 
5202   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5203   // FIXME: While for scalable vectors this is currently sufficient, this should
5204   // be replaced by a more detailed mechanism that filters out specific VFs,
5205   // instead of invalidating vectorization for a whole set of VFs based on the
5206   // MaxVF.
5207 
5208   // Disable scalable vectorization if the loop contains unsupported reductions.
5209   if (!canVectorizeReductions(MaxScalableVF)) {
5210     reportVectorizationInfo(
5211         "Scalable vectorization not supported for the reduction "
5212         "operations found in this loop.",
5213         "ScalableVFUnfeasible", ORE, TheLoop);
5214     return ElementCount::getScalable(0);
5215   }
5216 
5217   // Disable scalable vectorization if the loop contains any instructions
5218   // with element types not supported for scalable vectors.
5219   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5220         return !Ty->isVoidTy() &&
5221                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5222       })) {
5223     reportVectorizationInfo("Scalable vectorization is not supported "
5224                             "for all element types found in this loop.",
5225                             "ScalableVFUnfeasible", ORE, TheLoop);
5226     return ElementCount::getScalable(0);
5227   }
5228 
5229   if (Legal->isSafeForAnyVectorWidth())
5230     return MaxScalableVF;
5231 
5232   // Limit MaxScalableVF by the maximum safe dependence distance.
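  // For example, with MaxSafeElements == 32 and a maximum vscale of 16, the
  // largest safe scalable VF is vscale x 2 (32 / 16).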
5233   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5234   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5235     MaxVScale =
5236         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5237   MaxScalableVF = ElementCount::getScalable(
5238       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5239   if (!MaxScalableVF)
5240     reportVectorizationInfo(
5241         "Max legal vector width too small, scalable vectorization "
5242         "unfeasible.",
5243         "ScalableVFUnfeasible", ORE, TheLoop);
5244 
5245   return MaxScalableVF;
5246 }
5247 
5248 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5249     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5250   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5251   unsigned SmallestType, WidestType;
5252   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5253 
5254   // Get the maximum safe dependence distance in bits computed by LAA.
5255   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5256   // the memory accesses that is most restrictive (involved in the smallest
5257   // dependence distance).
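  // For example, a maximum safe vector width of 256 bits with a widest type of
  // i32 gives MaxSafeElements == 8.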
5258   unsigned MaxSafeElements =
5259       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
5260 
5261   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5262   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5263 
5264   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5265                     << ".\n");
5266   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5267                     << ".\n");
5268 
5269   // First analyze the UserVF, fall back if the UserVF should be ignored.
5270   if (UserVF) {
5271     auto MaxSafeUserVF =
5272         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5273 
5274     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5275       // If `VF=vscale x N` is safe, then so is `VF=N`
5276       if (UserVF.isScalable())
5277         return FixedScalableVFPair(
5278             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5279       else
5280         return UserVF;
5281     }
5282 
5283     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5284 
5285     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5286     // is better to ignore the hint and let the compiler choose a suitable VF.
5287     if (!UserVF.isScalable()) {
5288       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5289                         << " is unsafe, clamping to max safe VF="
5290                         << MaxSafeFixedVF << ".\n");
5291       ORE->emit([&]() {
5292         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5293                                           TheLoop->getStartLoc(),
5294                                           TheLoop->getHeader())
5295                << "User-specified vectorization factor "
5296                << ore::NV("UserVectorizationFactor", UserVF)
5297                << " is unsafe, clamping to maximum safe vectorization factor "
5298                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5299       });
5300       return MaxSafeFixedVF;
5301     }
5302 
5303     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5304       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5305                         << " is ignored because scalable vectors are not "
5306                            "available.\n");
5307       ORE->emit([&]() {
5308         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5309                                           TheLoop->getStartLoc(),
5310                                           TheLoop->getHeader())
5311                << "User-specified vectorization factor "
5312                << ore::NV("UserVectorizationFactor", UserVF)
5313                << " is ignored because the target does not support scalable "
5314                   "vectors. The compiler will pick a more suitable value.";
5315       });
5316     } else {
5317       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5318                         << " is unsafe. Ignoring scalable UserVF.\n");
5319       ORE->emit([&]() {
5320         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5321                                           TheLoop->getStartLoc(),
5322                                           TheLoop->getHeader())
5323                << "User-specified vectorization factor "
5324                << ore::NV("UserVectorizationFactor", UserVF)
5325                << " is unsafe. Ignoring the hint to let the compiler pick a "
5326                   "more suitable value.";
5327       });
5328     }
5329   }
5330 
5331   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5332                     << " / " << WidestType << " bits.\n");
5333 
5334   FixedScalableVFPair Result(ElementCount::getFixed(1),
5335                              ElementCount::getScalable(0));
5336   if (auto MaxVF =
5337           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5338                                   MaxSafeFixedVF, FoldTailByMasking))
5339     Result.FixedVF = MaxVF;
5340 
5341   if (auto MaxVF =
5342           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5343                                   MaxSafeScalableVF, FoldTailByMasking))
5344     if (MaxVF.isScalable()) {
5345       Result.ScalableVF = MaxVF;
5346       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5347                         << "\n");
5348     }
5349 
5350   return Result;
5351 }
5352 
5353 FixedScalableVFPair
5354 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5355   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
5358     reportVectorizationFailure(
5359         "Not inserting runtime ptr check for divergent target",
5360         "runtime pointer checks needed. Not enabled for divergent target",
5361         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5362     return FixedScalableVFPair::getNone();
5363   }
5364 
5365   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5366   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5367   if (TC == 1) {
5368     reportVectorizationFailure("Single iteration (non) loop",
5369         "loop trip count is one, irrelevant for vectorization",
5370         "SingleIterationLoop", ORE, TheLoop);
5371     return FixedScalableVFPair::getNone();
5372   }
5373 
5374   switch (ScalarEpilogueStatus) {
5375   case CM_ScalarEpilogueAllowed:
5376     return computeFeasibleMaxVF(TC, UserVF, false);
5377   case CM_ScalarEpilogueNotAllowedUsePredicate:
5378     LLVM_FALLTHROUGH;
5379   case CM_ScalarEpilogueNotNeededUsePredicate:
5380     LLVM_DEBUG(
5381         dbgs() << "LV: vector predicate hint/switch found.\n"
5382                << "LV: Not allowing scalar epilogue, creating predicated "
5383                << "vector loop.\n");
5384     break;
5385   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5386     // fallthrough as a special case of OptForSize
5387   case CM_ScalarEpilogueNotAllowedOptSize:
5388     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5389       LLVM_DEBUG(
5390           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5391     else
5392       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5393                         << "count.\n");
5394 
5395     // Bail if runtime checks are required, which are not good when optimising
5396     // for size.
5397     if (runtimeChecksRequired())
5398       return FixedScalableVFPair::getNone();
5399 
5400     break;
5401   }
5402 
5403   // The only loops we can vectorize without a scalar epilogue, are loops with
5404   // a bottom-test and a single exiting block. We'd have to handle the fact
5405   // that not every instruction executes on the last iteration.  This will
5406   // require a lane mask which varies through the vector loop body.  (TODO)
5407   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5408     // If there was a tail-folding hint/switch, but we can't fold the tail by
5409     // masking, fallback to a vectorization with a scalar epilogue.
5410     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5411       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5412                            "scalar epilogue instead.\n");
5413       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5414       return computeFeasibleMaxVF(TC, UserVF, false);
5415     }
5416     return FixedScalableVFPair::getNone();
5417   }
5418 
  // Now try to fold the tail by masking.
5420 
5421   // Invalidate interleave groups that require an epilogue if we can't mask
5422   // the interleave-group.
5423   if (!useMaskedInterleavedAccesses(TTI)) {
5424     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5425            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5428     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5429   }
5430 
5431   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5432   // Avoid tail folding if the trip count is known to be a multiple of any VF
5433   // we chose.
5434   // FIXME: The condition below pessimises the case for fixed-width vectors,
5435   // when scalable VFs are also candidates for vectorization.
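  // For example, with a known trip count of 1000, MaxFixedVF == 8 and
  // UserIC == 2, the remainder 1000 % (8 * 2) == 8 is non-zero, so a tail
  // remains and tail folding (or a scalar epilogue) is still needed.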
5436   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5437     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5438     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5439            "MaxFixedVF must be a power of 2");
5440     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5441                                    : MaxFixedVF.getFixedValue();
5442     ScalarEvolution *SE = PSE.getSE();
5443     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5444     const SCEV *ExitCount = SE->getAddExpr(
5445         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5446     const SCEV *Rem = SE->getURemExpr(
5447         SE->applyLoopGuards(ExitCount, TheLoop),
5448         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5449     if (Rem->isZero()) {
5450       // Accept MaxFixedVF if we do not have a tail.
5451       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5452       return MaxFactors;
5453     }
5454   }
5455 
  // For scalable vectors, don't use tail folding for low trip counts or when
  // optimizing for code size; we only permit it if the user has explicitly
  // requested it.
5459   if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
5460       ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
5461       MaxFactors.ScalableVF.isVector())
5462     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5463 
5464   // If we don't know the precise trip count, or if the trip count that we
5465   // found modulo the vectorization factor is not zero, try to fold the tail
5466   // by masking.
5467   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5468   if (Legal->prepareToFoldTailByMasking()) {
5469     FoldTailByMasking = true;
5470     return MaxFactors;
5471   }
5472 
5473   // If there was a tail-folding hint/switch, but we can't fold the tail by
5474   // masking, fallback to a vectorization with a scalar epilogue.
5475   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5476     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5477                          "scalar epilogue instead.\n");
5478     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5479     return MaxFactors;
5480   }
5481 
5482   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5483     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5484     return FixedScalableVFPair::getNone();
5485   }
5486 
5487   if (TC == 0) {
5488     reportVectorizationFailure(
5489         "Unable to calculate the loop count due to complex control flow",
5490         "unable to calculate the loop count due to complex control flow",
5491         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5492     return FixedScalableVFPair::getNone();
5493   }
5494 
5495   reportVectorizationFailure(
5496       "Cannot optimize for size and vectorize at the same time.",
5497       "cannot optimize for size and vectorize at the same time. "
5498       "Enable vectorization of this loop with '#pragma clang loop "
5499       "vectorize(enable)' when compiling with -Os/-Oz",
5500       "NoTailLoopWithOptForSize", ORE, TheLoop);
5501   return FixedScalableVFPair::getNone();
5502 }
5503 
5504 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5505     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5506     const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
5507   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5508   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5509       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5510                            : TargetTransformInfo::RGK_FixedWidthVector);
5511 
5512   // Convenience function to return the minimum of two ElementCounts.
5513   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5514     assert((LHS.isScalable() == RHS.isScalable()) &&
5515            "Scalable flags must match");
5516     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5517   };
5518 
5519   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
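  // For example, a 128-bit widest register with a widest type of i32 allows at
  // most 4 elements (vscale x 4 when computing a scalable VF).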
5521   auto MaxVectorElementCount = ElementCount::get(
5522       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5523       ComputeScalableMaxVF);
5524   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5525   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5526                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5527 
5528   if (!MaxVectorElementCount) {
5529     LLVM_DEBUG(dbgs() << "LV: The target has no "
5530                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5531                       << " vector registers.\n");
5532     return ElementCount::getFixed(1);
5533   }
5534 
5535   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5536   if (ConstTripCount &&
5537       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5538       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If the loop trip count (TC) is known at compile time, there is no point
    // in choosing a VF greater than TC. Select the maximum power of two which
    // doesn't exceed TC.
    // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
    // when the TC is less than or equal to the known number of lanes.
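    // For example, a constant trip count of 6 with 8 available lanes clamps
    // the VF to 4, the largest power of two not exceeding the trip count
    // (assuming the tail is not being folded).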
5544     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5545     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5546                          "exceeding the constant trip count: "
5547                       << ClampedConstTripCount << "\n");
5548     return ElementCount::getFixed(ClampedConstTripCount);
5549   }
5550 
5551   ElementCount MaxVF = MaxVectorElementCount;
5552   if (TTI.shouldMaximizeVectorBandwidth() ||
5553       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5554     auto MaxVectorElementCountMaxBW = ElementCount::get(
5555         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5556         ComputeScalableMaxVF);
5557     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5558 
5559     // Collect all viable vectorization factors larger than the default MaxVF
5560     // (i.e. MaxVectorElementCount).
5561     SmallVector<ElementCount, 8> VFs;
5562     for (ElementCount VS = MaxVectorElementCount * 2;
5563          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5564       VFs.push_back(VS);
5565 
5566     // For each VF calculate its register usage.
5567     auto RUs = calculateRegisterUsage(VFs);
5568 
5569     // Select the largest VF which doesn't require more registers than existing
5570     // ones.
5571     for (int i = RUs.size() - 1; i >= 0; --i) {
5572       bool Selected = true;
5573       for (auto &pair : RUs[i].MaxLocalUsers) {
5574         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5575         if (pair.second > TargetNumRegisters)
5576           Selected = false;
5577       }
5578       if (Selected) {
5579         MaxVF = VFs[i];
5580         break;
5581       }
5582     }
5583     if (ElementCount MinVF =
5584             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5585       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5586         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5587                           << ") with target's minimum: " << MinVF << '\n');
5588         MaxVF = MinVF;
5589       }
5590     }
5591   }
5592   return MaxVF;
5593 }
5594 
5595 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5596   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5597     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5598     auto Min = Attr.getVScaleRangeMin();
5599     auto Max = Attr.getVScaleRangeMax();
5600     if (Max && Min == Max)
5601       return Max;
5602   }
5603 
5604   return TTI.getVScaleForTuning();
5605 }
5606 
5607 bool LoopVectorizationCostModel::isMoreProfitable(
5608     const VectorizationFactor &A, const VectorizationFactor &B) const {
5609   InstructionCost CostA = A.Cost;
5610   InstructionCost CostB = B.Cost;
5611 
5612   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5613 
5614   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5615       MaxTripCount) {
5616     // If we are folding the tail and the trip count is a known (possibly small)
5617     // constant, the trip count will be rounded up to an integer number of
5618     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5619     // which we compare directly. When not folding the tail, the total cost will
5620     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5621     // approximated with the per-lane cost below instead of using the tripcount
5622     // as here.
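    // For example, with MaxTripCount == 10, a VF of 4 needs 3 vector
    // iterations and a VF of 8 needs 2, so the comparison below is
    // 3 * CostA vs. 2 * CostB.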
5623     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5624     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5625     return RTCostA < RTCostB;
5626   }
5627 
5628   // Improve estimate for the vector width if it is scalable.
5629   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5630   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5631   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5632     if (A.Width.isScalable())
5633       EstimatedWidthA *= VScale.getValue();
5634     if (B.Width.isScalable())
5635       EstimatedWidthB *= VScale.getValue();
5636   }
5637 
5638   // Assume vscale may be larger than 1 (or the value being tuned for),
5639   // so that scalable vectorization is slightly favorable over fixed-width
5640   // vectorization.
5641   if (A.Width.isScalable() && !B.Width.isScalable())
5642     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5643 
5644   // To avoid the need for FP division:
5645   //      (CostA / A.Width) < (CostB / B.Width)
5646   // <=>  (CostA * B.Width) < (CostB * A.Width)
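  // For example, CostA == 8 at width 4 against CostB == 20 at width 8 compares
  // 8 * 8 == 64 < 20 * 4 == 80, so A (cost 2 per lane) beats B (2.5 per lane).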
5647   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5648 }
5649 
5650 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5651     const ElementCountSet &VFCandidates) {
5652   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5653   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5654   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5655   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5656          "Expected Scalar VF to be a candidate");
5657 
5658   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5659   VectorizationFactor ChosenFactor = ScalarCost;
5660 
5661   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5662   if (ForceVectorization && VFCandidates.size() > 1) {
5663     // Ignore scalar width, because the user explicitly wants vectorization.
5664     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5665     // evaluation.
5666     ChosenFactor.Cost = InstructionCost::getMax();
5667   }
5668 
5669   SmallVector<InstructionVFPair> InvalidCosts;
5670   for (const auto &i : VFCandidates) {
5671     // The cost for scalar VF=1 is already calculated, so ignore it.
5672     if (i.isScalar())
5673       continue;
5674 
5675     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5676     VectorizationFactor Candidate(i, C.first);
5677 
5678 #ifndef NDEBUG
5679     unsigned AssumedMinimumVscale = 1;
5680     if (Optional<unsigned> VScale = getVScaleForTuning())
5681       AssumedMinimumVscale = VScale.getValue();
5682     unsigned Width =
5683         Candidate.Width.isScalable()
5684             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5685             : Candidate.Width.getFixedValue();
5686     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5687                       << " costs: " << (Candidate.Cost / Width));
5688     if (i.isScalable())
5689       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5690                         << AssumedMinimumVscale << ")");
5691     LLVM_DEBUG(dbgs() << ".\n");
5692 #endif
5693 
5694     if (!C.second && !ForceVectorization) {
5695       LLVM_DEBUG(
5696           dbgs() << "LV: Not considering vector loop of width " << i
5697                  << " because it will not generate any vector instructions.\n");
5698       continue;
5699     }
5700 
5701     // If profitable add it to ProfitableVF list.
5702     if (isMoreProfitable(Candidate, ScalarCost))
5703       ProfitableVFs.push_back(Candidate);
5704 
5705     if (isMoreProfitable(Candidate, ChosenFactor))
5706       ChosenFactor = Candidate;
5707   }
5708 
5709   // Emit a report of VFs with invalid costs in the loop.
5710   if (!InvalidCosts.empty()) {
5711     // Group the remarks per instruction, keeping the instruction order from
5712     // InvalidCosts.
5713     std::map<Instruction *, unsigned> Numbering;
5714     unsigned I = 0;
5715     for (auto &Pair : InvalidCosts)
5716       if (!Numbering.count(Pair.first))
5717         Numbering[Pair.first] = I++;
5718 
5719     // Sort the list, first on instruction(number) then on VF.
5720     llvm::sort(InvalidCosts,
5721                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5722                  if (Numbering[A.first] != Numbering[B.first])
5723                    return Numbering[A.first] < Numbering[B.first];
5724                  ElementCountComparator ECC;
5725                  return ECC(A.second, B.second);
5726                });
5727 
5728     // For a list of ordered instruction-vf pairs:
5729     //   [(load, vf1), (load, vf2), (store, vf1)]
5730     // Group the instructions together to emit separate remarks for:
5731     //   load  (vf1, vf2)
5732     //   store (vf1)
5733     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5734     auto Subset = ArrayRef<InstructionVFPair>();
5735     do {
5736       if (Subset.empty())
5737         Subset = Tail.take_front(1);
5738 
5739       Instruction *I = Subset.front().first;
5740 
5741       // If the next instruction is different, or if there are no other pairs,
5742       // emit a remark for the collated subset. e.g.
5743       //   [(load, vf1), (load, vf2))]
5744       // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5746       if (Subset == Tail || Tail[Subset.size()].first != I) {
5747         std::string OutString;
5748         raw_string_ostream OS(OutString);
5749         assert(!Subset.empty() && "Unexpected empty range");
5750         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5751         for (auto &Pair : Subset)
5752           OS << (Pair.second == Subset.front().second ? "" : ", ")
5753              << Pair.second;
5754         OS << "):";
5755         if (auto *CI = dyn_cast<CallInst>(I))
5756           OS << " call to " << CI->getCalledFunction()->getName();
5757         else
5758           OS << " " << I->getOpcodeName();
5759         OS.flush();
5760         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5761         Tail = Tail.drop_front(Subset.size());
5762         Subset = {};
5763       } else
5764         // Grow the subset by one element
5765         Subset = Tail.take_front(Subset.size() + 1);
5766     } while (!Tail.empty());
5767   }
5768 
5769   if (!EnableCondStoresVectorization && NumPredStores) {
5770     reportVectorizationFailure("There are conditional stores.",
5771         "store that is conditionally executed prevents vectorization",
5772         "ConditionalStore", ORE, TheLoop);
5773     ChosenFactor = ScalarCost;
5774   }
5775 
5776   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5777                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5778              << "LV: Vectorization seems to be not beneficial, "
5779              << "but was forced by a user.\n");
5780   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5781   return ChosenFactor;
5782 }
5783 
5784 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5785     const Loop &L, ElementCount VF) const {
5786   // Cross iteration phis such as reductions need special handling and are
5787   // currently unsupported.
5788   if (any_of(L.getHeader()->phis(),
5789              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5790     return false;
5791 
5792   // Phis with uses outside of the loop require special handling and are
5793   // currently unsupported.
5794   for (auto &Entry : Legal->getInductionVars()) {
5795     // Look for uses of the value of the induction at the last iteration.
5796     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5797     for (User *U : PostInc->users())
5798       if (!L.contains(cast<Instruction>(U)))
5799         return false;
5800     // Look for uses of penultimate value of the induction.
5801     for (User *U : Entry.first->users())
5802       if (!L.contains(cast<Instruction>(U)))
5803         return false;
5804   }
5805 
5806   // Induction variables that are widened require special handling that is
5807   // currently not supported.
5808   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5809         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5810                  this->isProfitableToScalarize(Entry.first, VF));
5811       }))
5812     return false;
5813 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5817   if (L.getExitingBlock() != L.getLoopLatch())
5818     return false;
5819 
5820   return true;
5821 }
5822 
5823 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5824     const ElementCount VF) const {
5825   // FIXME: We need a much better cost-model to take different parameters such
5826   // as register pressure, code size increase and cost of extra branches into
5827   // account. For now we apply a very crude heuristic and only consider loops
5828   // with vectorization factors larger than a certain value.
5829   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5831   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5832     return false;
5833   // FIXME: We should consider changing the threshold for scalable
5834   // vectors to take VScaleForTuning into account.
5835   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5836     return true;
5837   return false;
5838 }
5839 
5840 VectorizationFactor
5841 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5842     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5843   VectorizationFactor Result = VectorizationFactor::Disabled();
5844   if (!EnableEpilogueVectorization) {
5845     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5846     return Result;
5847   }
5848 
5849   if (!isScalarEpilogueAllowed()) {
5850     LLVM_DEBUG(
5851         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5852                   "allowed.\n";);
5853     return Result;
5854   }
5855 
5856   // Not really a cost consideration, but check for unsupported cases here to
5857   // simplify the logic.
5858   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5859     LLVM_DEBUG(
5860         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5861                   "not a supported candidate.\n";);
5862     return Result;
5863   }
5864 
5865   if (EpilogueVectorizationForceVF > 1) {
5866     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5867     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5868     if (LVP.hasPlanWithVF(ForcedEC))
5869       return {ForcedEC, 0};
5870     else {
5871       LLVM_DEBUG(
5872           dbgs()
5873               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5874       return Result;
5875     }
5876   }
5877 
5878   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5879       TheLoop->getHeader()->getParent()->hasMinSize()) {
5880     LLVM_DEBUG(
5881         dbgs()
5882             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5883     return Result;
5884   }
5885 
5886   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5887     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5888                          "this loop\n");
5889     return Result;
5890   }
5891 
5892   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5893   // the main loop handles 8 lanes per iteration. We could still benefit from
5894   // vectorizing the epilogue loop with VF=4.
5895   ElementCount EstimatedRuntimeVF = MainLoopVF;
5896   if (MainLoopVF.isScalable()) {
5897     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5898     if (Optional<unsigned> VScale = getVScaleForTuning())
5899       EstimatedRuntimeVF *= VScale.getValue();
5900   }
5901 
5902   for (auto &NextVF : ProfitableVFs)
5903     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5904           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5905          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5906         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5907         LVP.hasPlanWithVF(NextVF.Width))
5908       Result = NextVF;
5909 
5910   if (Result != VectorizationFactor::Disabled())
5911     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5912                       << Result.Width << "\n";);
5913   return Result;
5914 }
5915 
5916 std::pair<unsigned, unsigned>
5917 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5918   unsigned MinWidth = -1U;
5919   unsigned MaxWidth = 8;
5920   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5921   // For in-loop reductions, no element types are added to ElementTypesInLoop
5922   // if there are no loads/stores in the loop. In this case, check through the
5923   // reduction variables to determine the maximum width.
5924   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5925     // Reset MaxWidth so that we can find the smallest type used by recurrences
5926     // in the loop.
5927     MaxWidth = -1U;
5928     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5929       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5930       // When finding the min width used by the recurrence we need to account
5931       // for casts on the input operands of the recurrence.
5932       MaxWidth = std::min<unsigned>(
5933           MaxWidth, std::min<unsigned>(
5934                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5935                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5936     }
5937   } else {
5938     for (Type *T : ElementTypesInLoop) {
5939       MinWidth = std::min<unsigned>(
5940           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5941       MaxWidth = std::max<unsigned>(
5942           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5943     }
5944   }
5945   return {MinWidth, MaxWidth};
5946 }
5947 
5948 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5949   ElementTypesInLoop.clear();
5950   // For each block.
5951   for (BasicBlock *BB : TheLoop->blocks()) {
5952     // For each instruction in the loop.
5953     for (Instruction &I : BB->instructionsWithoutDebug()) {
5954       Type *T = I.getType();
5955 
5956       // Skip ignored values.
5957       if (ValuesToIgnore.count(&I))
5958         continue;
5959 
5960       // Only examine Loads, Stores and PHINodes.
5961       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5962         continue;
5963 
5964       // Examine PHI nodes that are reduction variables. Update the type to
5965       // account for the recurrence type.
5966       if (auto *PN = dyn_cast<PHINode>(&I)) {
5967         if (!Legal->isReductionVariable(PN))
5968           continue;
5969         const RecurrenceDescriptor &RdxDesc =
5970             Legal->getReductionVars().find(PN)->second;
5971         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5972             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5973                                       RdxDesc.getRecurrenceType(),
5974                                       TargetTransformInfo::ReductionFlags()))
5975           continue;
5976         T = RdxDesc.getRecurrenceType();
5977       }
5978 
5979       // Examine the stored values.
5980       if (auto *ST = dyn_cast<StoreInst>(&I))
5981         T = ST->getValueOperand()->getType();
5982 
5983       assert(T->isSized() &&
5984              "Expected the load/store/recurrence type to be sized");
5985 
5986       ElementTypesInLoop.insert(T);
5987     }
5988   }
5989 }
5990 
5991 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5992                                                            unsigned LoopCost) {
5993   // -- The interleave heuristics --
5994   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5995   // There are many micro-architectural considerations that we can't predict
5996   // at this level. For example, frontend pressure (on decode or fetch) due to
5997   // code size, or the number and capabilities of the execution ports.
5998   //
5999   // We use the following heuristics to select the interleave count:
6000   // 1. If the code has reductions, then we interleave to break the cross
6001   // iteration dependency.
6002   // 2. If the loop is really small, then we interleave to reduce the loop
6003   // overhead.
6004   // 3. We don't interleave if we think that we will spill registers to memory
6005   // due to the increased register pressure.
6006 
6007   if (!isScalarEpilogueAllowed())
6008     return 1;
6009 
  // If there is a finite maximum safe dependence distance, it has already been
  // used to limit the vectorization factor; do not interleave further.
6011   if (Legal->getMaxSafeDepDistBytes() != -1U)
6012     return 1;
6013 
6014   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6015   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. However, we will still interleave when
  // InterleaveSmallLoopScalarReduction is enabled and the code has scalar
  // reductions (HasReductions && VF == 1), because under those conditions
  // interleaving can expose ILP and break cross-iteration dependences for
  // reductions.
6021   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6022       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6023     return 1;
6024 
6025   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so make sure every register class reports
  // at least one user; this avoids dividing by zero.
6028   for (auto& pair : R.MaxLocalUsers) {
6029     pair.second = std::max(pair.second, 1U);
6030   }
6031 
  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power-of-two interleave counts to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case IC
  // is set to 1 above.
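  // As a rough illustration (made-up numbers): with 32 registers in a class,
  // 2 loop-invariant values and a maximum of 6 simultaneously live values, the
  // per-class computation below yields PowerOf2Floor((32 - 2) / 6) = 4
  // interleaved instances (or PowerOf2Floor((32 - 2 - 1) / 5) = 4 when the
  // induction variable is excluded via EnableIndVarRegisterHeur).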
6043   unsigned IC = UINT_MAX;
6044 
6045   for (auto& pair : R.MaxLocalUsers) {
6046     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6047     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6048                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
6050     if (VF.isScalar()) {
6051       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6052         TargetNumRegisters = ForceTargetNumScalarRegs;
6053     } else {
6054       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6055         TargetNumRegisters = ForceTargetNumVectorRegs;
6056     }
6057     unsigned MaxLocalUsers = pair.second;
6058     unsigned LoopInvariantRegs = 0;
6059     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6060       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6061 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6063     // Don't count the induction variable as interleaved.
6064     if (EnableIndVarRegisterHeur) {
6065       TmpIC =
6066           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6067                         std::max(1U, (MaxLocalUsers - 1)));
6068     }
6069 
6070     IC = std::min(IC, TmpIC);
6071   }
6072 
6073   // Clamp the interleave ranges to reasonable counts.
6074   unsigned MaxInterleaveCount =
6075       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6076 
6077   // Check if the user has overridden the max.
6078   if (VF.isScalar()) {
6079     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6080       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6081   } else {
6082     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6083       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6084   }
6085 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, while keeping it
  // at least 1.
  //
  // For scalable vectors we can't know whether interleaving is beneficial. It
  // may not be beneficial for small loops if none of the lanes in the second
  // vector iteration are enabled. However, for larger loops, there is likely
  // to be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the interleave count as if vscale were '1', although if some
  // information about the vector is known (e.g. min vector size), we can make
  // a better decision.
6096   if (BestKnownTC) {
6097     MaxInterleaveCount =
6098         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6099     // Make sure MaxInterleaveCount is greater than 0.
6100     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
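    // For example (illustrative numbers): an estimated trip count of 20 with
    // VF = 8 limits MaxInterleaveCount to at most 20 / 8 = 2.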
6101   }
6102 
6103   assert(MaxInterleaveCount > 0 &&
6104          "Maximum interleave count must be greater than 0");
6105 
  // Clamp the calculated IC to be between 1 and the max interleave count that
  // the target and trip count allow.
6108   if (IC > MaxInterleaveCount)
6109     IC = MaxInterleaveCount;
6110   else
6111     // Make sure IC is greater than 0.
6112     IC = std::max(1u, IC);
6113 
6114   assert(IC > 0 && "Interleave count must be greater than 0.");
6115 
6116   // If we did not calculate the cost for VF (because the user selected the VF)
6117   // then we calculate the cost of VF here.
6118   if (LoopCost == 0) {
6119     InstructionCost C = expectedCost(VF).first;
6120     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6121     LoopCost = *C.getValue();
6122   }
6123 
6124   assert(LoopCost && "Non-zero loop cost expected");
6125 
6126   // Interleave if we vectorized this loop and there is a reduction that could
6127   // benefit from interleaving.
6128   if (VF.isVector() && HasReductions) {
6129     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6130     return IC;
6131   }
6132 
6133   // For any scalar loop that either requires runtime checks or predication we
6134   // are better off leaving this to the unroller. Note that if we've already
6135   // vectorized the loop we will have done the runtime check and so interleaving
6136   // won't require further checks.
6137   bool ScalarInterleavingRequiresPredication =
6138       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
6139          return Legal->blockNeedsPredication(BB);
6140        }));
6141   bool ScalarInterleavingRequiresRuntimePointerCheck =
6142       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6143 
6144   // We want to interleave small loops in order to reduce the loop overhead and
6145   // potentially expose ILP opportunities.
6146   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6147                     << "LV: IC is " << IC << '\n'
6148                     << "LV: VF is " << VF << '\n');
6149   const bool AggressivelyInterleaveReductions =
6150       TTI.enableAggressiveInterleaving(HasReductions);
6151   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
6152       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
6153     // We assume that the cost overhead is 1 and we use the cost model
6154     // to estimate the cost of the loop and interleave until the cost of the
6155     // loop overhead is about 5% of the cost of the loop.
6156     unsigned SmallIC =
6157         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
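    // Illustrative arithmetic, assuming the default SmallLoopCost threshold of
    // 20: a loop with LoopCost = 6 gives PowerOf2Floor(20 / 6) = 2, so at most
    // two interleaved copies are created by this branch of the heuristic.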
6158 
6159     // Interleave until store/load ports (estimated by max interleave count) are
6160     // saturated.
6161     unsigned NumStores = Legal->getNumStores();
6162     unsigned NumLoads = Legal->getNumLoads();
6163     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6164     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6165 
6166     // There is little point in interleaving for reductions containing selects
6167     // and compares when VF=1 since it may just create more overhead than it's
6168     // worth for loops with small trip counts. This is because we still have to
6169     // do the final reduction after the loop.
6170     bool HasSelectCmpReductions =
6171         HasReductions &&
6172         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6173           const RecurrenceDescriptor &RdxDesc = Reduction.second;
6174           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
6175               RdxDesc.getRecurrenceKind());
6176         });
6177     if (HasSelectCmpReductions) {
6178       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
6179       return 1;
6180     }
6181 
6182     // If we have a scalar reduction (vector reductions are already dealt with
6183     // by this point), we can increase the critical path length if the loop
6184     // we're interleaving is inside another loop. For tree-wise reductions
6185     // set the limit to 2, and for ordered reductions it's best to disable
6186     // interleaving entirely.
6187     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6188       bool HasOrderedReductions =
6189           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6190             const RecurrenceDescriptor &RdxDesc = Reduction.second;
6191             return RdxDesc.isOrdered();
6192           });
6193       if (HasOrderedReductions) {
6194         LLVM_DEBUG(
6195             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6196         return 1;
6197       }
6198 
6199       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6200       SmallIC = std::min(SmallIC, F);
6201       StoresIC = std::min(StoresIC, F);
6202       LoadsIC = std::min(LoadsIC, F);
6203     }
6204 
6205     if (EnableLoadStoreRuntimeInterleave &&
6206         std::max(StoresIC, LoadsIC) > SmallIC) {
6207       LLVM_DEBUG(
6208           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6209       return std::max(StoresIC, LoadsIC);
6210     }
6211 
6212     // If there are scalar reductions and TTI has enabled aggressive
6213     // interleaving for reductions, we will interleave to expose ILP.
6214     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6215         AggressivelyInterleaveReductions) {
6216       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6217       // Interleave no less than SmallIC but not as aggressive as the normal IC
6218       // to satisfy the rare situation when resources are too limited.
6219       return std::max(IC / 2, SmallIC);
6220     } else {
6221       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6222       return SmallIC;
6223     }
6224   }
6225 
6226   // Interleave if this is a large loop (small loops are already dealt with by
6227   // this point) that could benefit from interleaving.
6228   if (AggressivelyInterleaveReductions) {
6229     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6230     return IC;
6231   }
6232 
6233   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6234   return 1;
6235 }
6236 
6237 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6238 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6239   // This function calculates the register usage by measuring the highest number
6240   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order to assign a number
  // to each instruction. We use RPO to ensure that defs are
6243   // met before their users. We assume that each instruction that has in-loop
6244   // users starts an interval. We record every time that an in-loop value is
6245   // used, so we have a list of the first and last occurrences of each
6246   // instruction. Next, we transpose this data structure into a multi map that
6247   // holds the list of intervals that *end* at a specific location. This multi
6248   // map allows us to perform a linear search. We scan the instructions linearly
6249   // and record each time that a new interval starts, by placing it in a set.
6250   // If we find this value in the multi-map then we remove it from the set.
6251   // The max register usage is the maximum size of the set.
6252   // We also search for instructions that are defined outside the loop, but are
6253   // used inside the loop. We need this number separately from the max-interval
  // usage number because, when we unroll, loop-invariant values do not consume
  // more registers.
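  // Rough illustration (hypothetical IR): for the chain
  //   %a = load i32, i32* %p
  //   %b = add i32 %a, 1
  //   %c = mul i32 %b, %b
  // each value's interval ends at its single in-loop use, so no more than two
  // values are live at the same point and the per-class estimate stays small.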
6256   LoopBlocksDFS DFS(TheLoop);
6257   DFS.perform(LI);
6258 
6259   RegisterUsage RU;
6260 
6261   // Each 'key' in the map opens a new interval. The values
6262   // of the map are the index of the 'last seen' usage of the
6263   // instruction that is the key.
6264   using IntervalMap = DenseMap<Instruction *, unsigned>;
6265 
6266   // Maps instruction to its index.
6267   SmallVector<Instruction *, 64> IdxToInstr;
6268   // Marks the end of each interval.
6269   IntervalMap EndPoint;
6270   // Saves the list of instruction indices that are used in the loop.
6271   SmallPtrSet<Instruction *, 8> Ends;
6272   // Saves the list of values that are used in the loop but are
6273   // defined outside the loop, such as arguments and constants.
6274   SmallPtrSet<Value *, 8> LoopInvariants;
6275 
6276   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6277     for (Instruction &I : BB->instructionsWithoutDebug()) {
6278       IdxToInstr.push_back(&I);
6279 
6280       // Save the end location of each USE.
6281       for (Value *U : I.operands()) {
6282         auto *Instr = dyn_cast<Instruction>(U);
6283 
6284         // Ignore non-instruction values such as arguments, constants, etc.
6285         if (!Instr)
6286           continue;
6287 
6288         // If this instruction is outside the loop then record it and continue.
6289         if (!TheLoop->contains(Instr)) {
6290           LoopInvariants.insert(Instr);
6291           continue;
6292         }
6293 
6294         // Overwrite previous end points.
6295         EndPoint[Instr] = IdxToInstr.size();
6296         Ends.insert(Instr);
6297       }
6298     }
6299   }
6300 
6301   // Saves the list of intervals that end with the index in 'key'.
6302   using InstrList = SmallVector<Instruction *, 2>;
6303   DenseMap<unsigned, InstrList> TransposeEnds;
6304 
6305   // Transpose the EndPoints to a list of values that end at each index.
6306   for (auto &Interval : EndPoint)
6307     TransposeEnds[Interval.second].push_back(Interval.first);
6308 
6309   SmallPtrSet<Instruction *, 8> OpenIntervals;
6310   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6311   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6312 
6313   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6314 
6315   // A lambda that gets the register usage for the given type and VF.
6316   const auto &TTICapture = TTI;
6317   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6318     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6319       return 0;
6320     InstructionCost::CostType RegUsage =
6321         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6322     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6323            "Nonsensical values for register usage.");
6324     return RegUsage;
6325   };
6326 
6327   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6328     Instruction *I = IdxToInstr[i];
6329 
6330     // Remove all of the instructions that end at this location.
6331     InstrList &List = TransposeEnds[i];
6332     for (Instruction *ToRemove : List)
6333       OpenIntervals.erase(ToRemove);
6334 
6335     // Ignore instructions that are never used within the loop.
6336     if (!Ends.count(I))
6337       continue;
6338 
6339     // Skip ignored values.
6340     if (ValuesToIgnore.count(I))
6341       continue;
6342 
6343     // For each VF find the maximum usage of registers.
6344     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6345       // Count the number of live intervals.
6346       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6347 
6348       if (VFs[j].isScalar()) {
6349         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
6351           if (RegUsage.find(ClassID) == RegUsage.end())
6352             RegUsage[ClassID] = 1;
6353           else
6354             RegUsage[ClassID] += 1;
6355         }
6356       } else {
6357         collectUniformsAndScalars(VFs[j]);
6358         for (auto Inst : OpenIntervals) {
6359           // Skip ignored values for VF > 1.
6360           if (VecValuesToIgnore.count(Inst))
6361             continue;
6362           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
6364             if (RegUsage.find(ClassID) == RegUsage.end())
6365               RegUsage[ClassID] = 1;
6366             else
6367               RegUsage[ClassID] += 1;
6368           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
6370             if (RegUsage.find(ClassID) == RegUsage.end())
6371               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6372             else
6373               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6374           }
6375         }
6376       }
6377 
6378       for (auto& pair : RegUsage) {
6379         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
6381         else
6382           MaxUsages[j][pair.first] = pair.second;
6383       }
6384     }
6385 
6386     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6387                       << OpenIntervals.size() << '\n');
6388 
6389     // Add the current instruction to the list of open intervals.
6390     OpenIntervals.insert(I);
6391   }
6392 
6393   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6394     SmallMapVector<unsigned, unsigned, 4> Invariant;
6395 
6396     for (auto Inst : LoopInvariants) {
6397       unsigned Usage =
6398           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6399       unsigned ClassID =
6400           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6401       if (Invariant.find(ClassID) == Invariant.end())
6402         Invariant[ClassID] = Usage;
6403       else
6404         Invariant[ClassID] += Usage;
6405     }
6406 
6407     LLVM_DEBUG({
6408       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6409       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6410              << " item\n";
6411       for (const auto &pair : MaxUsages[i]) {
6412         dbgs() << "LV(REG): RegisterClass: "
6413                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6414                << " registers\n";
6415       }
6416       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6417              << " item\n";
6418       for (const auto &pair : Invariant) {
6419         dbgs() << "LV(REG): RegisterClass: "
6420                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6421                << " registers\n";
6422       }
6423     });
6424 
6425     RU.LoopInvariantRegs = Invariant;
6426     RU.MaxLocalUsers = MaxUsages[i];
6427     RUs[i] = RU;
6428   }
6429 
6430   return RUs;
6431 }
6432 
6433 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6434                                                            ElementCount VF) {
6435   // TODO: Cost model for emulated masked load/store is completely
6436   // broken. This hack guides the cost model to use an artificially
6437   // high enough value to practically disable vectorization with such
6438   // operations, except where previously deployed legality hack allowed
6439   // using very low cost values. This is to avoid regressions coming simply
6440   // from moving "masked load/store" check from legality to cost model.
6441   // Masked Load/Gather emulation was previously never allowed.
  // Emulation of only a limited number of Masked Store/Scatter operations was
  // allowed.
6443   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6444   return isa<LoadInst>(I) ||
6445          (isa<StoreInst>(I) &&
6446           NumPredStores > NumberOfStoresToPredicate);
6447 }
6448 
6449 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6450   // If we aren't vectorizing the loop, or if we've already collected the
6451   // instructions to scalarize, there's nothing to do. Collection may already
6452   // have occurred if we have a user-selected VF and are now computing the
6453   // expected cost for interleaving.
6454   if (VF.isScalar() || VF.isZero() ||
6455       InstsToScalarize.find(VF) != InstsToScalarize.end())
6456     return;
6457 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6459   // not profitable to scalarize any instructions, the presence of VF in the
6460   // map will indicate that we've analyzed it already.
6461   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6462 
6463   // Find all the instructions that are scalar with predication in the loop and
6464   // determine if it would be better to not if-convert the blocks they are in.
6465   // If so, we also record the instructions to scalarize.
6466   for (BasicBlock *BB : TheLoop->blocks()) {
6467     if (!blockNeedsPredicationForAnyReason(BB))
6468       continue;
6469     for (Instruction &I : *BB)
6470       if (isScalarWithPredication(&I, VF)) {
6471         ScalarCostsTy ScalarCosts;
6472         // Do not apply discount if scalable, because that would lead to
6473         // invalid scalarization costs.
6474         // Do not apply discount logic if hacked cost is needed
6475         // for emulated masked memrefs.
6476         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6477             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6478           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6479         // Remember that BB will remain after vectorization.
6480         PredicatedBBsAfterVectorization.insert(BB);
6481       }
6482   }
6483 }
6484 
6485 int LoopVectorizationCostModel::computePredInstDiscount(
6486     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6487   assert(!isUniformAfterVectorization(PredInst, VF) &&
6488          "Instruction marked uniform-after-vectorization will be predicated");
6489 
6490   // Initialize the discount to zero, meaning that the scalar version and the
6491   // vector version cost the same.
6492   InstructionCost Discount = 0;
6493 
6494   // Holds instructions to analyze. The instructions we visit are mapped in
6495   // ScalarCosts. Those instructions are the ones that would be scalarized if
6496   // we find that the scalar version costs less.
6497   SmallVector<Instruction *, 8> Worklist;
6498 
6499   // Returns true if the given instruction can be scalarized.
6500   auto canBeScalarized = [&](Instruction *I) -> bool {
6501     // We only attempt to scalarize instructions forming a single-use chain
6502     // from the original predicated block that would otherwise be vectorized.
6503     // Although not strictly necessary, we give up on instructions we know will
6504     // already be scalar to avoid traversing chains that are unlikely to be
6505     // beneficial.
6506     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6507         isScalarAfterVectorization(I, VF))
6508       return false;
6509 
6510     // If the instruction is scalar with predication, it will be analyzed
6511     // separately. We ignore it within the context of PredInst.
6512     if (isScalarWithPredication(I, VF))
6513       return false;
6514 
6515     // If any of the instruction's operands are uniform after vectorization,
6516     // the instruction cannot be scalarized. This prevents, for example, a
6517     // masked load from being scalarized.
6518     //
6519     // We assume we will only emit a value for lane zero of an instruction
6520     // marked uniform after vectorization, rather than VF identical values.
6521     // Thus, if we scalarize an instruction that uses a uniform, we would
6522     // create uses of values corresponding to the lanes we aren't emitting code
6523     // for. This behavior can be changed by allowing getScalarValue to clone
6524     // the lane zero values for uniforms rather than asserting.
6525     for (Use &U : I->operands())
6526       if (auto *J = dyn_cast<Instruction>(U.get()))
6527         if (isUniformAfterVectorization(J, VF))
6528           return false;
6529 
6530     // Otherwise, we can scalarize the instruction.
6531     return true;
6532   };
6533 
6534   // Compute the expected cost discount from scalarizing the entire expression
6535   // feeding the predicated instruction. We currently only consider expressions
6536   // that are single-use instruction chains.
6537   Worklist.push_back(PredInst);
6538   while (!Worklist.empty()) {
6539     Instruction *I = Worklist.pop_back_val();
6540 
6541     // If we've already analyzed the instruction, there's nothing to do.
6542     if (ScalarCosts.find(I) != ScalarCosts.end())
6543       continue;
6544 
6545     // Compute the cost of the vector instruction. Note that this cost already
6546     // includes the scalarization overhead of the predicated instruction.
6547     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6548 
6549     // Compute the cost of the scalarized instruction. This cost is the cost of
6550     // the instruction as if it wasn't if-converted and instead remained in the
6551     // predicated block. We will scale this cost by block probability after
6552     // computing the scalarization overhead.
6553     InstructionCost ScalarCost =
6554         VF.getFixedValue() *
6555         getInstructionCost(I, ElementCount::getFixed(1)).first;
6556 
6557     // Compute the scalarization overhead of needed insertelement instructions
6558     // and phi nodes.
6559     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6560       ScalarCost += TTI.getScalarizationOverhead(
6561           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6562           APInt::getAllOnes(VF.getFixedValue()), true, false);
6563       ScalarCost +=
6564           VF.getFixedValue() *
6565           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6566     }
6567 
6568     // Compute the scalarization overhead of needed extractelement
6569     // instructions. For each of the instruction's operands, if the operand can
6570     // be scalarized, add it to the worklist; otherwise, account for the
6571     // overhead.
6572     for (Use &U : I->operands())
6573       if (auto *J = dyn_cast<Instruction>(U.get())) {
6574         assert(VectorType::isValidElementType(J->getType()) &&
6575                "Instruction has non-scalar type");
6576         if (canBeScalarized(J))
6577           Worklist.push_back(J);
6578         else if (needsExtract(J, VF)) {
6579           ScalarCost += TTI.getScalarizationOverhead(
6580               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6581               APInt::getAllOnes(VF.getFixedValue()), false, true);
6582         }
6583       }
6584 
6585     // Scale the total scalar cost by block probability.
6586     ScalarCost /= getReciprocalPredBlockProb();
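    // E.g. with a reciprocal block probability of 2 (the predicated block is
    // assumed to run on roughly half of the iterations), a raw scalar cost of
    // 8 is accounted as 4 here.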
6587 
6588     // Compute the discount. A non-negative discount means the vector version
6589     // of the instruction costs more, and scalarizing would be beneficial.
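    // For instance (illustrative costs): if the vectorized form of the chain
    // costs 10 and the probability-scaled scalar form costs 6, the accumulated
    // discount is 4 and scalarizing the chain is considered profitable.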
6590     Discount += VectorCost - ScalarCost;
6591     ScalarCosts[I] = ScalarCost;
6592   }
6593 
6594   return *Discount.getValue();
6595 }
6596 
6597 LoopVectorizationCostModel::VectorizationCostTy
6598 LoopVectorizationCostModel::expectedCost(
6599     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6600   VectorizationCostTy Cost;
6601 
6602   // For each block.
6603   for (BasicBlock *BB : TheLoop->blocks()) {
6604     VectorizationCostTy BlockCost;
6605 
6606     // For each instruction in the old loop.
6607     for (Instruction &I : BB->instructionsWithoutDebug()) {
6608       // Skip ignored values.
6609       if (ValuesToIgnore.count(&I) ||
6610           (VF.isVector() && VecValuesToIgnore.count(&I)))
6611         continue;
6612 
6613       VectorizationCostTy C = getInstructionCost(&I, VF);
6614 
6615       // Check if we should override the cost.
6616       if (C.first.isValid() &&
6617           ForceTargetInstructionCost.getNumOccurrences() > 0)
6618         C.first = InstructionCost(ForceTargetInstructionCost);
6619 
6620       // Keep a list of instructions with invalid costs.
6621       if (Invalid && !C.first.isValid())
6622         Invalid->emplace_back(&I, VF);
6623 
6624       BlockCost.first += C.first;
6625       BlockCost.second |= C.second;
6626       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6627                         << " for VF " << VF << " For instruction: " << I
6628                         << '\n');
6629     }
6630 
6631     // If we are vectorizing a predicated block, it will have been
6632     // if-converted. This means that the block's instructions (aside from
6633     // stores and instructions that may divide by zero) will now be
6634     // unconditionally executed. For the scalar case, we may not always execute
6635     // the predicated block, if it is an if-else block. Thus, scale the block's
6636     // cost by the probability of executing it. blockNeedsPredication from
6637     // Legal is used so as to not include all blocks in tail folded loops.
6638     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6639       BlockCost.first /= getReciprocalPredBlockProb();
6640 
6641     Cost.first += BlockCost.first;
6642     Cost.second |= BlockCost.second;
6643   }
6644 
6645   return Cost;
6646 }
6647 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop-invariant except for the induction-variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
6653 static const SCEV *getAddressAccessSCEV(
6654               Value *Ptr,
6655               LoopVectorizationLegality *Legal,
6656               PredicatedScalarEvolution &PSE,
6657               const Loop *TheLoop) {
6658 
6659   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6660   if (!Gep)
6661     return nullptr;
6662 
6663   // We are looking for a gep with all loop invariant indices except for one
6664   // which should be an induction variable.
6665   auto SE = PSE.getSE();
6666   unsigned NumOperands = Gep->getNumOperands();
6667   for (unsigned i = 1; i < NumOperands; ++i) {
6668     Value *Opd = Gep->getOperand(i);
6669     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6670         !Legal->isInductionVariable(Opd))
6671       return nullptr;
6672   }
6673 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6675   return PSE.getSCEV(Ptr);
6676 }
6677 
6678 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6679   return Legal->hasStride(I->getOperand(0)) ||
6680          Legal->hasStride(I->getOperand(1));
6681 }
6682 
6683 InstructionCost
6684 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6685                                                         ElementCount VF) {
6686   assert(VF.isVector() &&
6687          "Scalarization cost of instruction implies vectorization.");
6688   if (VF.isScalable())
6689     return InstructionCost::getInvalid();
6690 
6691   Type *ValTy = getLoadStoreType(I);
6692   auto SE = PSE.getSE();
6693 
6694   unsigned AS = getLoadStoreAddressSpace(I);
6695   Value *Ptr = getLoadStorePointerOperand(I);
6696   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6697   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6698   //       that it is being called from this specific place.
6699 
  // Figure out whether the access is strided and get the stride value if it is
  // known at compile time.
6702   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6703 
6704   // Get the cost of the scalar memory instruction and address computation.
6705   InstructionCost Cost =
6706       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6707 
6708   // Don't pass *I here, since it is scalar but will actually be part of a
6709   // vectorized loop where the user of it is a vectorized instruction.
6710   const Align Alignment = getLoadStoreAlignment(I);
6711   Cost += VF.getKnownMinValue() *
6712           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6713                               AS, TTI::TCK_RecipThroughput);
6714 
6715   // Get the overhead of the extractelement and insertelement instructions
6716   // we might create due to scalarization.
6717   Cost += getScalarizationOverhead(I, VF);
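  // Illustrative arithmetic (made-up unit costs): with VF = 4, an address
  // computation cost of 1 and a scalar memory-op cost of 4, the cost so far is
  // 4 * (1 + 4) = 20 plus whatever insert/extract overhead was added above.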
6718 
6719   // If we have a predicated load/store, it will need extra i1 extracts and
6720   // conditional branches, but may not be executed for each vector lane. Scale
6721   // the cost by the probability of executing the predicated block.
6722   if (isPredicatedInst(I, VF)) {
6723     Cost /= getReciprocalPredBlockProb();
6724 
6725     // Add the cost of an i1 extract and a branch
6726     auto *Vec_i1Ty =
6727         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6728     Cost += TTI.getScalarizationOverhead(
6729         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6730         /*Insert=*/false, /*Extract=*/true);
6731     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6732 
6733     if (useEmulatedMaskMemRefHack(I, VF))
6734       // Artificially setting to a high enough value to practically disable
6735       // vectorization with such operations.
6736       Cost = 3000000;
6737   }
6738 
6739   return Cost;
6740 }
6741 
6742 InstructionCost
6743 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6744                                                     ElementCount VF) {
6745   Type *ValTy = getLoadStoreType(I);
6746   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6747   Value *Ptr = getLoadStorePointerOperand(I);
6748   unsigned AS = getLoadStoreAddressSpace(I);
6749   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6750   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6751 
6752   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6753          "Stride should be 1 or -1 for consecutive memory access");
6754   const Align Alignment = getLoadStoreAlignment(I);
6755   InstructionCost Cost = 0;
6756   if (Legal->isMaskRequired(I))
6757     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6758                                       CostKind);
6759   else
6760     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6761                                 CostKind, I);
6762 
6763   bool Reverse = ConsecutiveStride < 0;
6764   if (Reverse)
6765     Cost +=
6766         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6767   return Cost;
6768 }
6769 
6770 InstructionCost
6771 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6772                                                 ElementCount VF) {
6773   assert(Legal->isUniformMemOp(*I));
6774 
6775   Type *ValTy = getLoadStoreType(I);
6776   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6777   const Align Alignment = getLoadStoreAlignment(I);
6778   unsigned AS = getLoadStoreAddressSpace(I);
6779   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6780   if (isa<LoadInst>(I)) {
6781     return TTI.getAddressComputationCost(ValTy) +
6782            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6783                                CostKind) +
6784            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6785   }
6786   StoreInst *SI = cast<StoreInst>(I);
6787 
6788   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6789   return TTI.getAddressComputationCost(ValTy) +
6790          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6791                              CostKind) +
6792          (isLoopInvariantStoreValue
6793               ? 0
6794               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6795                                        VF.getKnownMinValue() - 1));
6796 }
6797 
6798 InstructionCost
6799 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6800                                                  ElementCount VF) {
6801   Type *ValTy = getLoadStoreType(I);
6802   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6803   const Align Alignment = getLoadStoreAlignment(I);
6804   const Value *Ptr = getLoadStorePointerOperand(I);
6805 
6806   return TTI.getAddressComputationCost(VectorTy) +
6807          TTI.getGatherScatterOpCost(
6808              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6809              TargetTransformInfo::TCK_RecipThroughput, I);
6810 }
6811 
6812 InstructionCost
6813 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6814                                                    ElementCount VF) {
6815   // TODO: Once we have support for interleaving with scalable vectors
6816   // we can calculate the cost properly here.
6817   if (VF.isScalable())
6818     return InstructionCost::getInvalid();
6819 
6820   Type *ValTy = getLoadStoreType(I);
6821   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6822   unsigned AS = getLoadStoreAddressSpace(I);
6823 
6824   auto Group = getInterleavedAccessGroup(I);
6825   assert(Group && "Fail to get an interleaved access group.");
6826 
6827   unsigned InterleaveFactor = Group->getFactor();
6828   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6829 
6830   // Holds the indices of existing members in the interleaved group.
6831   SmallVector<unsigned, 4> Indices;
6832   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6833     if (Group->getMember(IF))
6834       Indices.push_back(IF);
6835 
6836   // Calculate the cost of the whole interleaved group.
6837   bool UseMaskForGaps =
6838       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6839       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6840   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6841       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6842       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6843 
6844   if (Group->isReverse()) {
6845     // TODO: Add support for reversed masked interleaved access.
6846     assert(!Legal->isMaskRequired(I) &&
6847            "Reverse masked interleaved access not supported.");
6848     Cost +=
6849         Group->getNumMembers() *
6850         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6851   }
6852   return Cost;
6853 }
6854 
6855 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6856     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6857   using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
6859   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6860     return None;
6861   auto *VectorTy = cast<VectorType>(Ty);
6862 
6863   // We are looking for a pattern of, and finding the minimal acceptable cost:
6864   //  reduce(mul(ext(A), ext(B))) or
6865   //  reduce(mul(A, B)) or
6866   //  reduce(ext(A)) or
6867   //  reduce(A).
6868   // The basic idea is that we walk down the tree to do that, finding the root
6869   // reduction instruction in InLoopReductionImmediateChains. From there we find
6870   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return None so that the original cost modelling is used
  // instead.
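  // As an illustrative IR sketch, the following would match the
  // reduce(mul(ext(A), ext(B))) form handled below:
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %phi, %mul   ; in-loop reduction add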
6875   Instruction *RetI = I;
6876   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6877     if (!RetI->hasOneUser())
6878       return None;
6879     RetI = RetI->user_back();
6880   }
6881   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6882       RetI->user_back()->getOpcode() == Instruction::Add) {
6883     if (!RetI->hasOneUser())
6884       return None;
6885     RetI = RetI->user_back();
6886   }
6887 
  // Test if the found instruction is a reduction, and if not return None so
  // that the caller falls back to the original cost modelling.
6890   if (!InLoopReductionImmediateChains.count(RetI))
6891     return None;
6892 
6893   // Find the reduction this chain is a part of and calculate the basic cost of
6894   // the reduction on its own.
6895   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6896   Instruction *ReductionPhi = LastChain;
6897   while (!isa<PHINode>(ReductionPhi))
6898     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6899 
6900   const RecurrenceDescriptor &RdxDesc =
6901       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6902 
6903   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6904       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6905 
6906   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6907   // normal fmul instruction to the cost of the fadd reduction.
6908   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6909     BaseCost +=
6910         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6911 
6912   // If we're using ordered reductions then we can just return the base cost
6913   // here, since getArithmeticReductionCost calculates the full ordered
6914   // reduction cost when FP reassociation is not allowed.
6915   if (useOrderedReductions(RdxDesc))
6916     return BaseCost;
6917 
6918   // Get the operand that was not the reduction chain and match it to one of the
6919   // patterns, returning the better cost if it is found.
6920   Instruction *RedOp = RetI->getOperand(1) == LastChain
6921                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6922                            : dyn_cast<Instruction>(RetI->getOperand(1));
6923 
6924   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6925 
6926   Instruction *Op0, *Op1;
6927   if (RedOp &&
6928       match(RedOp,
6929             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6930       match(Op0, m_ZExtOrSExt(m_Value())) &&
6931       Op0->getOpcode() == Op1->getOpcode() &&
6932       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6933       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6934       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6935 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6937     // Note that the extend opcodes need to all match, or if A==B they will have
6938     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6939     // which is equally fine.
6940     bool IsUnsigned = isa<ZExtInst>(Op0);
6941     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6942     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6943 
6944     InstructionCost ExtCost =
6945         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6946                              TTI::CastContextHint::None, CostKind, Op0);
6947     InstructionCost MulCost =
6948         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6949     InstructionCost Ext2Cost =
6950         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6951                              TTI::CastContextHint::None, CostKind, RedOp);
6952 
6953     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6954         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6955         CostKind);
6956 
6957     if (RedCost.isValid() &&
6958         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6959       return I == RetI ? RedCost : 0;
6960   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6961              !TheLoop->isLoopInvariant(RedOp)) {
6962     // Matched reduce(ext(A))
6963     bool IsUnsigned = isa<ZExtInst>(RedOp);
6964     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6965     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6966         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6967         CostKind);
6968 
6969     InstructionCost ExtCost =
6970         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6971                              TTI::CastContextHint::None, CostKind, RedOp);
6972     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6973       return I == RetI ? RedCost : 0;
6974   } else if (RedOp &&
6975              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6976     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6977         Op0->getOpcode() == Op1->getOpcode() &&
6978         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6979       bool IsUnsigned = isa<ZExtInst>(Op0);
6980       Type *Op0Ty = Op0->getOperand(0)->getType();
6981       Type *Op1Ty = Op1->getOperand(0)->getType();
6982       Type *LargestOpTy =
6983           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6984                                                                     : Op0Ty;
6985       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6986 
6987       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
6988       // different sizes. We take the largest type as the ext to reduce, and add
6989       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6990       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6991           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6992           TTI::CastContextHint::None, CostKind, Op0);
6993       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6994           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6995           TTI::CastContextHint::None, CostKind, Op1);
6996       InstructionCost MulCost =
6997           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6998 
6999       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7000           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7001           CostKind);
7002       InstructionCost ExtraExtCost = 0;
7003       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
7004         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
7005         ExtraExtCost = TTI.getCastInstrCost(
7006             ExtraExtOp->getOpcode(), ExtType,
7007             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
7008             TTI::CastContextHint::None, CostKind, ExtraExtOp);
7009       }
7010 
7011       if (RedCost.isValid() &&
7012           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
7013         return I == RetI ? RedCost : 0;
7014     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
7015       // Matched reduce(mul())
7016       InstructionCost MulCost =
7017           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7018 
7019       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7020           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7021           CostKind);
7022 
7023       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7024         return I == RetI ? RedCost : 0;
7025     }
7026   }
7027 
7028   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
7029 }
7030 
7031 InstructionCost
7032 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7033                                                      ElementCount VF) {
  // Calculate the scalar cost only. The vectorization cost should already be
  // available at this point.
7036   if (VF.isScalar()) {
7037     Type *ValTy = getLoadStoreType(I);
7038     const Align Alignment = getLoadStoreAlignment(I);
7039     unsigned AS = getLoadStoreAddressSpace(I);
7040 
7041     return TTI.getAddressComputationCost(ValTy) +
7042            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7043                                TTI::TCK_RecipThroughput, I);
7044   }
7045   return getWideningCost(I, VF);
7046 }
7047 
7048 LoopVectorizationCostModel::VectorizationCostTy
7049 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7050                                                ElementCount VF) {
7051   // If we know that this instruction will remain uniform, check the cost of
7052   // the scalar version.
7053   if (isUniformAfterVectorization(I, VF))
7054     VF = ElementCount::getFixed(1);
7055 
7056   if (VF.isVector() && isProfitableToScalarize(I, VF))
7057     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7058 
7059   // Forced scalars do not have any scalarization overhead.
7060   auto ForcedScalar = ForcedScalars.find(VF);
7061   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7062     auto InstSet = ForcedScalar->second;
7063     if (InstSet.count(I))
7064       return VectorizationCostTy(
7065           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7066            VF.getKnownMinValue()),
7067           false);
7068   }
7069 
7070   Type *VectorTy;
7071   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7072 
7073   bool TypeNotScalarized = false;
7074   if (VF.isVector() && VectorTy->isVectorTy()) {
7075     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
7076     if (NumParts)
7077       TypeNotScalarized = NumParts < VF.getKnownMinValue();
7078     else
7079       C = InstructionCost::getInvalid();
7080   }
7081   return VectorizationCostTy(C, TypeNotScalarized);
7082 }
7083 
7084 InstructionCost
7085 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7086                                                      ElementCount VF) const {
7087 
7088   // There is no mechanism yet to create a scalable scalarization loop,
7089   // so this is currently Invalid.
7090   if (VF.isScalable())
7091     return InstructionCost::getInvalid();
7092 
7093   if (VF.isScalar())
7094     return 0;
7095 
7096   InstructionCost Cost = 0;
7097   Type *RetTy = ToVectorTy(I->getType(), VF);
7098   if (!RetTy->isVoidTy() &&
7099       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7100     Cost += TTI.getScalarizationOverhead(
7101         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
7102         false);
7103 
7104   // Some targets keep addresses scalar.
7105   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7106     return Cost;
7107 
7108   // Some targets support efficient element stores.
7109   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7110     return Cost;
7111 
7112   // Collect operands to consider.
7113   CallInst *CI = dyn_cast<CallInst>(I);
7114   Instruction::op_range Ops = CI ? CI->args() : I->operands();
7115 
7116   // Skip operands that do not require extraction/scalarization and do not incur
7117   // any overhead.
7118   SmallVector<Type *> Tys;
7119   for (auto *V : filterExtractingOperands(Ops, VF))
7120     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7121   return Cost + TTI.getOperandsScalarizationOverhead(
7122                     filterExtractingOperands(Ops, VF), Tys);
7123 }
7124 
7125 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7126   if (VF.isScalar())
7127     return;
7128   NumPredStores = 0;
7129   for (BasicBlock *BB : TheLoop->blocks()) {
7130     // For each instruction in the old loop.
7131     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
7133       if (!Ptr)
7134         continue;
7135 
7136       // TODO: We should generate better code and update the cost model for
7137       // predicated uniform stores. Today they are treated as any other
7138       // predicated store (see added test cases in
7139       // invariant-store-vectorization.ll).
7140       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
7141         NumPredStores++;
7142 
7143       if (Legal->isUniformMemOp(I)) {
7144         // TODO: Avoid replicating loads and stores instead of
7145         // relying on instcombine to remove them.
7146         // Load: Scalar load + broadcast
7147         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7148         InstructionCost Cost;
7149         if (isa<StoreInst>(&I) && VF.isScalable() &&
7150             isLegalGatherOrScatter(&I, VF)) {
7151           Cost = getGatherScatterCost(&I, VF);
7152           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7153         } else {
7154           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7155                  "Cannot yet scalarize uniform stores");
7156           Cost = getUniformMemOpCost(&I, VF);
7157           setWideningDecision(&I, VF, CM_Scalarize, Cost);
7158         }
7159         continue;
7160       }
7161 
7162       // We assume that widening is the best solution when possible.
7163       if (memoryInstructionCanBeWidened(&I, VF)) {
7164         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7165         int ConsecutiveStride = Legal->isConsecutivePtr(
7166             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
7167         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7168                "Expected consecutive stride.");
7169         InstWidening Decision =
7170             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7171         setWideningDecision(&I, VF, Decision, Cost);
7172         continue;
7173       }
7174 
7175       // Choose between Interleaving, Gather/Scatter or Scalarization.
7176       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7177       unsigned NumAccesses = 1;
7178       if (isAccessInterleaved(&I)) {
7179         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
7181 
7182         // Make one decision for the whole group.
7183         if (getWideningDecision(&I, VF) != CM_Unknown)
7184           continue;
7185 
7186         NumAccesses = Group->getNumMembers();
7187         if (interleavedAccessCanBeWidened(&I, VF))
7188           InterleaveCost = getInterleaveGroupCost(&I, VF);
7189       }
7190 
7191       InstructionCost GatherScatterCost =
7192           isLegalGatherOrScatter(&I, VF)
7193               ? getGatherScatterCost(&I, VF) * NumAccesses
7194               : InstructionCost::getInvalid();
7195 
7196       InstructionCost ScalarizationCost =
7197           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7198 
7199       // Choose better solution for the current VF,
7200       // write down this decision and use it during vectorization.
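      // Note how the comparisons below break ties: interleaving is preferred
      // over an equally costly gather/scatter, while scalarization is chosen
      // whenever neither alternative is strictly cheaper.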
7201       InstructionCost Cost;
7202       InstWidening Decision;
7203       if (InterleaveCost <= GatherScatterCost &&
7204           InterleaveCost < ScalarizationCost) {
7205         Decision = CM_Interleave;
7206         Cost = InterleaveCost;
7207       } else if (GatherScatterCost < ScalarizationCost) {
7208         Decision = CM_GatherScatter;
7209         Cost = GatherScatterCost;
7210       } else {
7211         Decision = CM_Scalarize;
7212         Cost = ScalarizationCost;
7213       }
      // If the instruction belongs to an interleave group, the whole group
7215       // receives the same decision. The whole group receives the cost, but
7216       // the cost will actually be assigned to one instruction.
7217       if (auto Group = getInterleavedAccessGroup(&I))
7218         setWideningDecision(Group, VF, Decision, Cost);
7219       else
7220         setWideningDecision(&I, VF, Decision, Cost);
7221     }
7222   }
7223 
7224   // Make sure that any load of address and any other address computation
7225   // remains scalar unless there is gather/scatter support. This avoids
7226   // inevitable extracts into address registers, and also has the benefit of
7227   // activating LSR more, since that pass can't optimize vectorized
7228   // addresses.
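  // For example, a load that (directly or through same-block computations)
  // feeds the address of another memory access is scalarized below, and the
  // remaining address computation is forced scalar, so addresses can be kept
  // in scalar registers.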
7229   if (TTI.prefersVectorizedAddressing())
7230     return;
7231 
7232   // Start with all scalar pointer uses.
7233   SmallPtrSet<Instruction *, 8> AddrDefs;
7234   for (BasicBlock *BB : TheLoop->blocks())
7235     for (Instruction &I : *BB) {
7236       Instruction *PtrDef =
7237         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7238       if (PtrDef && TheLoop->contains(PtrDef) &&
7239           getWideningDecision(&I, VF) != CM_GatherScatter)
7240         AddrDefs.insert(PtrDef);
7241     }
7242 
7243   // Add all instructions used to generate the addresses.
7244   SmallVector<Instruction *, 4> Worklist;
7245   append_range(Worklist, AddrDefs);
7246   while (!Worklist.empty()) {
7247     Instruction *I = Worklist.pop_back_val();
7248     for (auto &Op : I->operands())
7249       if (auto *InstOp = dyn_cast<Instruction>(Op))
7250         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7251             AddrDefs.insert(InstOp).second)
7252           Worklist.push_back(InstOp);
7253   }
7254 
7255   for (auto *I : AddrDefs) {
7256     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this requires knowing whether the loaded
      // register is involved in an address computation, it is instead
      // changed here when we know this is the case.
7261       InstWidening Decision = getWideningDecision(I, VF);
7262       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7263         // Scalarize a widened load of address.
7264         setWideningDecision(
7265             I, VF, CM_Scalarize,
7266             (VF.getKnownMinValue() *
7267              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7268       else if (auto Group = getInterleavedAccessGroup(I)) {
7269         // Scalarize an interleave group of address loads.
7270         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7271           if (Instruction *Member = Group->getMember(I))
7272             setWideningDecision(
7273                 Member, VF, CM_Scalarize,
7274                 (VF.getKnownMinValue() *
7275                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7276         }
7277       }
7278     } else
7279       // Make sure I gets scalarized and a cost estimate without
7280       // scalarization overhead.
7281       ForcedScalars[VF].insert(I);
7282   }
7283 }
7284 
7285 InstructionCost
7286 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7287                                                Type *&VectorTy) {
7288   Type *RetTy = I->getType();
7289   if (canTruncateToMinimalBitwidth(I, VF))
7290     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7291   auto SE = PSE.getSE();
7292   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7293 
7294   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7295                                                 ElementCount VF) -> bool {
7296     if (VF.isScalar())
7297       return true;
7298 
7299     auto Scalarized = InstsToScalarize.find(VF);
7300     assert(Scalarized != InstsToScalarize.end() &&
7301            "VF not yet analyzed for scalarization profitability");
7302     return !Scalarized->second.count(I) &&
7303            llvm::all_of(I->users(), [&](User *U) {
7304              auto *UI = cast<Instruction>(U);
7305              return !Scalarized->second.count(UI);
7306            });
7307   };
7308   (void) hasSingleCopyAfterVectorization;
7309 
7310   if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // we don't have to multiply the instruction cost by VF.
7316     assert(I->getOpcode() == Instruction::GetElementPtr ||
7317            I->getOpcode() == Instruction::PHI ||
7318            (I->getOpcode() == Instruction::BitCast &&
7319             I->getType()->isPointerTy()) ||
7320            hasSingleCopyAfterVectorization(I, VF));
7321     VectorTy = RetTy;
7322   } else
7323     VectorTy = ToVectorTy(RetTy, VF);
7324 
7325   // TODO: We need to estimate the cost of intrinsic calls.
7326   switch (I->getOpcode()) {
7327   case Instruction::GetElementPtr:
7328     // We mark this instruction as zero-cost because the cost of GEPs in
7329     // vectorized code depends on whether the corresponding memory instruction
7330     // is scalarized or not. Therefore, we handle GEPs with the memory
7331     // instruction cost.
7332     return 0;
7333   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
7337     bool ScalarPredicatedBB = false;
7338     BranchInst *BI = cast<BranchInst>(I);
7339     if (VF.isVector() && BI->isConditional() &&
7340         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7341          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7342       ScalarPredicatedBB = true;
7343 
7344     if (ScalarPredicatedBB) {
7345       // Not possible to scalarize scalable vector with predicated instructions.
7346       if (VF.isScalable())
7347         return InstructionCost::getInvalid();
7348       // Return cost for branches around scalarized and predicated blocks.
7349       auto *Vec_i1Ty =
7350           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7351       return (
7352           TTI.getScalarizationOverhead(
7353               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7354           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7355     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7356       // The back-edge branch will remain, as will all scalar branches.
7357       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7358     else
7359       // This branch will be eliminated by if-conversion.
7360       return 0;
7361     // Note: We currently assume zero cost for an unconditional branch inside
7362     // a predicated block since it will become a fall-through, although we
7363     // may decide in the future to call TTI for all branches.
7364   }
7365   case Instruction::PHI: {
7366     auto *Phi = cast<PHINode>(I);
7367 
7368     // First-order recurrences are replaced by vector shuffles inside the loop.
7369     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7370     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7371       return TTI.getShuffleCost(
7372           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7373           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7374 
7375     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7376     // converted into select instructions. We require N - 1 selects per phi
7377     // node, where N is the number of incoming values.
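    // For example, a phi with three incoming values is lowered to two selects.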
7378     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7379       return (Phi->getNumIncomingValues() - 1) *
7380              TTI.getCmpSelInstrCost(
7381                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7382                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7383                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7384 
7385     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7386   }
7387   case Instruction::UDiv:
7388   case Instruction::SDiv:
7389   case Instruction::URem:
7390   case Instruction::SRem:
7391     // If we have a predicated instruction, it may not be executed for each
7392     // vector lane. Get the scalarization cost and scale this amount by the
7393     // probability of executing the predicated block. If the instruction is not
7394     // predicated, we fall through to the next case.
7395     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7396       InstructionCost Cost = 0;
7397 
7398       // These instructions have a non-void type, so account for the phi nodes
7399       // that we will create. This cost is likely to be zero. The phi node
7400       // cost, if any, should be scaled by the block probability because it
7401       // models a copy at the end of each predicated block.
7402       Cost += VF.getKnownMinValue() *
7403               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7404 
7405       // The cost of the non-predicated instruction.
7406       Cost += VF.getKnownMinValue() *
7407               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7408 
7409       // The cost of insertelement and extractelement instructions needed for
7410       // scalarization.
7411       Cost += getScalarizationOverhead(I, VF);
7412 
7413       // Scale the cost by the probability of executing the predicated blocks.
7414       // This assumes the predicated block for each vector lane is equally
7415       // likely.
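      // For example, with a reciprocal block probability of 2 (i.e. each
      // predicated block executes on roughly every other iteration), the cost
      // computed above is halved.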
7416       return Cost / getReciprocalPredBlockProb();
7417     }
7418     LLVM_FALLTHROUGH;
7419   case Instruction::Add:
7420   case Instruction::FAdd:
7421   case Instruction::Sub:
7422   case Instruction::FSub:
7423   case Instruction::Mul:
7424   case Instruction::FMul:
7425   case Instruction::FDiv:
7426   case Instruction::FRem:
7427   case Instruction::Shl:
7428   case Instruction::LShr:
7429   case Instruction::AShr:
7430   case Instruction::And:
7431   case Instruction::Or:
7432   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
7434     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7435       return 0;
7436 
7437     // Detect reduction patterns
7438     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7439       return *RedCost;
7440 
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7443     Value *Op2 = I->getOperand(1);
7444     TargetTransformInfo::OperandValueProperties Op2VP;
7445     TargetTransformInfo::OperandValueKind Op2VK =
7446         TTI.getOperandInfo(Op2, Op2VP);
7447     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7448       Op2VK = TargetTransformInfo::OK_UniformValue;
7449 
7450     SmallVector<const Value *, 4> Operands(I->operand_values());
7451     return TTI.getArithmeticInstrCost(
7452         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7453         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7454   }
7455   case Instruction::FNeg: {
7456     return TTI.getArithmeticInstrCost(
7457         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7458         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7459         TargetTransformInfo::OP_None, I->getOperand(0), I);
7460   }
7461   case Instruction::Select: {
7462     SelectInst *SI = cast<SelectInst>(I);
7463     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7464     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7465 
7466     const Value *Op0, *Op1;
7467     using namespace llvm::PatternMatch;
7468     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7469                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7470       // select x, y, false --> x & y
7471       // select x, true, y --> x | y
7472       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7473       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7474       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7475       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7476       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7477               Op1->getType()->getScalarSizeInBits() == 1);
7478 
7479       SmallVector<const Value *, 2> Operands{Op0, Op1};
7480       return TTI.getArithmeticInstrCost(
7481           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7482           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7483     }
7484 
7485     Type *CondTy = SI->getCondition()->getType();
7486     if (!ScalarCond)
7487       CondTy = VectorType::get(CondTy, VF);
7488 
7489     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7490     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7491       Pred = Cmp->getPredicate();
7492     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7493                                   CostKind, I);
7494   }
7495   case Instruction::ICmp:
7496   case Instruction::FCmp: {
7497     Type *ValTy = I->getOperand(0)->getType();
7498     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7499     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7500       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7501     VectorTy = ToVectorTy(ValTy, VF);
7502     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7503                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7504                                   I);
7505   }
7506   case Instruction::Store:
7507   case Instruction::Load: {
7508     ElementCount Width = VF;
7509     if (Width.isVector()) {
7510       InstWidening Decision = getWideningDecision(I, Width);
7511       assert(Decision != CM_Unknown &&
7512              "CM decision should be taken at this point");
7513       if (Decision == CM_Scalarize)
7514         Width = ElementCount::getFixed(1);
7515     }
7516     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7517     return getMemoryInstructionCost(I, VF);
7518   }
7519   case Instruction::BitCast:
7520     if (I->getType()->isPointerTy())
7521       return 0;
7522     LLVM_FALLTHROUGH;
7523   case Instruction::ZExt:
7524   case Instruction::SExt:
7525   case Instruction::FPToUI:
7526   case Instruction::FPToSI:
7527   case Instruction::FPExt:
7528   case Instruction::PtrToInt:
7529   case Instruction::IntToPtr:
7530   case Instruction::SIToFP:
7531   case Instruction::UIToFP:
7532   case Instruction::Trunc:
7533   case Instruction::FPTrunc: {
7534     // Computes the CastContextHint from a Load/Store instruction.
7535     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7536       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7537              "Expected a load or a store!");
7538 
7539       if (VF.isScalar() || !TheLoop->contains(I))
7540         return TTI::CastContextHint::Normal;
7541 
7542       switch (getWideningDecision(I, VF)) {
7543       case LoopVectorizationCostModel::CM_GatherScatter:
7544         return TTI::CastContextHint::GatherScatter;
7545       case LoopVectorizationCostModel::CM_Interleave:
7546         return TTI::CastContextHint::Interleave;
7547       case LoopVectorizationCostModel::CM_Scalarize:
7548       case LoopVectorizationCostModel::CM_Widen:
7549         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7550                                         : TTI::CastContextHint::Normal;
7551       case LoopVectorizationCostModel::CM_Widen_Reverse:
7552         return TTI::CastContextHint::Reversed;
7553       case LoopVectorizationCostModel::CM_Unknown:
7554         llvm_unreachable("Instr did not go through cost modelling?");
7555       }
7556 
7557       llvm_unreachable("Unhandled case!");
7558     };
7559 
7560     unsigned Opcode = I->getOpcode();
7561     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7562     // For Trunc, the context is the only user, which must be a StoreInst.
7563     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7564       if (I->hasOneUse())
7565         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7566           CCH = ComputeCCH(Store);
7567     }
7568     // For Z/Sext, the context is the operand, which must be a LoadInst.
7569     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7570              Opcode == Instruction::FPExt) {
7571       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7572         CCH = ComputeCCH(Load);
7573     }
7574 
7575     // We optimize the truncation of induction variables having constant
7576     // integer steps. The cost of these truncations is the same as the scalar
7577     // operation.
7578     if (isOptimizableIVTruncate(I, VF)) {
7579       auto *Trunc = cast<TruncInst>(I);
7580       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7581                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7582     }
7583 
7584     // Detect reduction patterns
7585     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7586       return *RedCost;
7587 
7588     Type *SrcScalarTy = I->getOperand(0)->getType();
7589     Type *SrcVecTy =
7590         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7591     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
7594       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7595       //
7596       // Calculate the modified src and dest types.
7597       Type *MinVecTy = VectorTy;
7598       if (Opcode == Instruction::Trunc) {
7599         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7600         VectorTy =
7601             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7602       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7603         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7604         VectorTy =
7605             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7606       }
7607     }
7608 
7609     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7610   }
7611   case Instruction::Call: {
7612     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7613       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7614         return *RedCost;
7615     bool NeedToScalarize;
7616     CallInst *CI = cast<CallInst>(I);
7617     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7618     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7619       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7620       return std::min(CallCost, IntrinsicCost);
7621     }
7622     return CallCost;
7623   }
7624   case Instruction::ExtractValue:
7625     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7626   case Instruction::Alloca:
7627     // We cannot easily widen alloca to a scalable alloca, as
7628     // the result would need to be a vector of pointers.
7629     if (VF.isScalable())
7630       return InstructionCost::getInvalid();
7631     LLVM_FALLTHROUGH;
7632   default:
7633     // This opcode is unknown. Assume that it is the same as 'mul'.
7634     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7635   } // end of switch.
7636 }
7637 
7638 char LoopVectorize::ID = 0;
7639 
7640 static const char lv_name[] = "Loop Vectorization";
7641 
7642 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7643 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7644 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7645 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7646 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7647 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7648 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7649 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7650 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7651 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7652 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7653 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7654 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7655 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7656 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7657 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7658 
7659 namespace llvm {
7660 
7661 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7662 
7663 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7664                               bool VectorizeOnlyWhenForced) {
7665   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7666 }
7667 
7668 } // end namespace llvm
7669 
7670 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7671   // Check if the pointer operand of a load or store instruction is
7672   // consecutive.
7673   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7674     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7675   return false;
7676 }
7677 
7678 void LoopVectorizationCostModel::collectValuesToIgnore() {
7679   // Ignore ephemeral values.
7680   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7681 
7682   // Ignore type-promoting instructions we identified during reduction
7683   // detection.
7684   for (auto &Reduction : Legal->getReductionVars()) {
7685     const RecurrenceDescriptor &RedDes = Reduction.second;
7686     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7687     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7688   }
7689   // Ignore type-casting instructions we identified during induction
7690   // detection.
7691   for (auto &Induction : Legal->getInductionVars()) {
7692     const InductionDescriptor &IndDes = Induction.second;
7693     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7694     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7695   }
7696 }
7697 
7698 void LoopVectorizationCostModel::collectInLoopReductions() {
7699   for (auto &Reduction : Legal->getReductionVars()) {
7700     PHINode *Phi = Reduction.first;
7701     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7702 
7703     // We don't collect reductions that are type promoted (yet).
7704     if (RdxDesc.getRecurrenceType() != Phi->getType())
7705       continue;
7706 
7707     // If the target would prefer this reduction to happen "in-loop", then we
7708     // want to record it as such.
7709     unsigned Opcode = RdxDesc.getOpcode();
7710     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7711         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7712                                    TargetTransformInfo::ReductionFlags()))
7713       continue;
7714 
7715     // Check that we can correctly put the reductions into the loop, by
7716     // finding the chain of operations that leads from the phi to the loop
7717     // exit value.
7718     SmallVector<Instruction *, 4> ReductionOperations =
7719         RdxDesc.getReductionOpChain(Phi, TheLoop);
7720     bool InLoop = !ReductionOperations.empty();
7721     if (InLoop) {
7722       InLoopReductionChains[Phi] = ReductionOperations;
7723       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7724       Instruction *LastChain = Phi;
7725       for (auto *I : ReductionOperations) {
7726         InLoopReductionImmediateChains[I] = LastChain;
7727         LastChain = I;
7728       }
7729     }
7730     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7731                       << " reduction for phi: " << *Phi << "\n");
7732   }
7733 }
7734 
7735 // TODO: we could return a pair of values that specify the max VF and
7736 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7738 // doesn't have a cost model that can choose which plan to execute if
7739 // more than one is generated.
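// For example, with 256-bit wide vector registers and a widest scalar type of
// 32 bits, determineVPlanVF below returns a VF of 8.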
7740 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7741                                  LoopVectorizationCostModel &CM) {
7742   unsigned WidestType;
7743   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7744   return WidestVectorRegBits / WidestType;
7745 }
7746 
7747 VectorizationFactor
7748 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7749   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7750   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
7753   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7754   // the vectorization pipeline.
7755   if (!OrigLoop->isInnermost()) {
7756     // If the user doesn't provide a vectorization factor, determine a
7757     // reasonable one.
7758     if (UserVF.isZero()) {
7759       VF = ElementCount::getFixed(determineVPlanVF(
7760           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7761               .getFixedSize(),
7762           CM));
7763       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7764 
7765       // Make sure we have a VF > 1 for stress testing.
7766       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7767         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7768                           << "overriding computed VF.\n");
7769         VF = ElementCount::getFixed(4);
7770       }
7771     }
7772     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7773     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7774            "VF needs to be a power of two");
7775     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7776                       << "VF " << VF << " to build VPlans.\n");
7777     buildVPlans(VF, VF);
7778 
7779     // For VPlan build stress testing, we bail out after VPlan construction.
7780     if (VPlanBuildStressTest)
7781       return VectorizationFactor::Disabled();
7782 
7783     return {VF, 0 /*Cost*/};
7784   }
7785 
7786   LLVM_DEBUG(
7787       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7788                 "VPlan-native path.\n");
7789   return VectorizationFactor::Disabled();
7790 }
7791 
7792 Optional<VectorizationFactor>
7793 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7794   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7795   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7797     return None;
7798 
7799   // Invalidate interleave groups if all blocks of loop will be predicated.
7800   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7801       !useMaskedInterleavedAccesses(*TTI)) {
7802     LLVM_DEBUG(
7803         dbgs()
7804         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7805            "which requires masked-interleaved support.\n");
7806     if (CM.InterleaveInfo.invalidateGroups())
7807       // Invalidating interleave groups also requires invalidating all decisions
7808       // based on them, which includes widening decisions and uniform and scalar
7809       // values.
7810       CM.invalidateCostModelingDecisions();
7811   }
7812 
7813   ElementCount MaxUserVF =
7814       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7815   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7816   if (!UserVF.isZero() && UserVFIsLegal) {
7817     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7818            "VF needs to be a power of two");
7819     // Collect the instructions (and their associated costs) that will be more
7820     // profitable to scalarize.
7821     if (CM.selectUserVectorizationFactor(UserVF)) {
7822       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7823       CM.collectInLoopReductions();
7824       buildVPlansWithVPRecipes(UserVF, UserVF);
7825       LLVM_DEBUG(printPlans(dbgs()));
7826       return {{UserVF, 0}};
7827     } else
7828       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7829                               "InvalidCost", ORE, OrigLoop);
7830   }
7831 
7832   // Populate the set of Vectorization Factor Candidates.
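  // These are the powers of two up to the maximum fixed and scalable factors,
  // e.g. fixed VFs 1, 2, 4 and 8 when MaxFactors.FixedVF is 8, and likewise
  // for the scalable VFs.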
7833   ElementCountSet VFCandidates;
7834   for (auto VF = ElementCount::getFixed(1);
7835        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7836     VFCandidates.insert(VF);
7837   for (auto VF = ElementCount::getScalable(1);
7838        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7839     VFCandidates.insert(VF);
7840 
7841   for (const auto &VF : VFCandidates) {
7842     // Collect Uniform and Scalar instructions after vectorization with VF.
7843     CM.collectUniformsAndScalars(VF);
7844 
7845     // Collect the instructions (and their associated costs) that will be more
7846     // profitable to scalarize.
7847     if (VF.isVector())
7848       CM.collectInstsToScalarize(VF);
7849   }
7850 
7851   CM.collectInLoopReductions();
7852   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7853   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7854 
7855   LLVM_DEBUG(printPlans(dbgs()));
7856   if (!MaxFactors.hasVector())
7857     return VectorizationFactor::Disabled();
7858 
7859   // Select the optimal vectorization factor.
7860   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7861 
7862   // Check if it is profitable to vectorize with runtime checks.
7863   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7864   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7865     bool PragmaThresholdReached =
7866         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7867     bool ThresholdReached =
7868         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7869     if ((ThresholdReached && !Hints.allowReordering()) ||
7870         PragmaThresholdReached) {
7871       ORE->emit([&]() {
7872         return OptimizationRemarkAnalysisAliasing(
7873                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7874                    OrigLoop->getHeader())
7875                << "loop not vectorized: cannot prove it is safe to reorder "
7876                   "memory operations";
7877       });
7878       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7879       Hints.emitRemarkWithHints();
7880       return VectorizationFactor::Disabled();
7881     }
7882   }
7883   return SelectedVF;
7884 }
7885 
7886 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7887   assert(count_if(VPlans,
7888                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7889              1 &&
7890          "Best VF has not a single VPlan.");
7891 
7892   for (const VPlanPtr &Plan : VPlans) {
7893     if (Plan->hasVF(VF))
7894       return *Plan.get();
7895   }
7896   llvm_unreachable("No plan found!");
7897 }
7898 
7899 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7900   SmallVector<Metadata *, 4> MDs;
7901   // Reserve first location for self reference to the LoopID metadata node.
7902   MDs.push_back(nullptr);
7903   bool IsUnrollMetadata = false;
7904   MDNode *LoopID = L->getLoopID();
7905   if (LoopID) {
7906     // First find existing loop unrolling disable metadata.
7907     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7908       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7909       if (MD) {
7910         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7911         IsUnrollMetadata =
7912             S && S->getString().startswith("llvm.loop.unroll.disable");
7913       }
7914       MDs.push_back(LoopID->getOperand(i));
7915     }
7916   }
7917 
7918   if (!IsUnrollMetadata) {
7919     // Add runtime unroll disable metadata.
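    // The node added below has the form
    // !{!"llvm.loop.unroll.runtime.disable"}.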
7920     LLVMContext &Context = L->getHeader()->getContext();
7921     SmallVector<Metadata *, 1> DisableOperands;
7922     DisableOperands.push_back(
7923         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7924     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7925     MDs.push_back(DisableNode);
7926     MDNode *NewLoopID = MDNode::get(Context, MDs);
7927     // Set operand 0 to refer to the loop id itself.
7928     NewLoopID->replaceOperandWith(0, NewLoopID);
7929     L->setLoopID(NewLoopID);
7930   }
7931 }
7932 
7933 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7934                                            VPlan &BestVPlan,
7935                                            InnerLoopVectorizer &ILV,
7936                                            DominatorTree *DT) {
7937   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7938                     << '\n');
7939 
7940   // Perform the actual loop transformation.
7941 
7942   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7943   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7944   Value *CanonicalIVStartValue;
7945   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7946       ILV.createVectorizedLoopSkeleton();
7947   ILV.collectPoisonGeneratingRecipes(State);
7948 
7949   ILV.printDebugTracesAtStart();
7950 
7951   //===------------------------------------------------===//
7952   //
  // Notice: any optimization or new instruction that goes
7954   // into the code below should also be implemented in
7955   // the cost-model.
7956   //
7957   //===------------------------------------------------===//
7958 
7959   // 2. Copy and widen instructions from the old loop into the new loop.
7960   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7961                              ILV.getOrCreateVectorTripCount(nullptr),
7962                              CanonicalIVStartValue, State);
7963   BestVPlan.execute(&State);
7964 
7965   // Keep all loop hints from the original loop on the vector loop (we'll
7966   // replace the vectorizer-specific hints below).
7967   MDNode *OrigLoopID = OrigLoop->getLoopID();
7968 
7969   Optional<MDNode *> VectorizedLoopID =
7970       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7971                                       LLVMLoopVectorizeFollowupVectorized});
7972 
7973   Loop *L = LI->getLoopFor(State.CFG.PrevBB);
7974   if (VectorizedLoopID.hasValue())
7975     L->setLoopID(VectorizedLoopID.getValue());
7976   else {
7977     // Keep all loop hints from the original loop on the vector loop (we'll
7978     // replace the vectorizer-specific hints below).
7979     if (MDNode *LID = OrigLoop->getLoopID())
7980       L->setLoopID(LID);
7981 
7982     LoopVectorizeHints Hints(L, true, *ORE);
7983     Hints.setAlreadyVectorized();
7984   }
7985   // Disable runtime unrolling when vectorizing the epilogue loop.
7986   if (CanonicalIVStartValue)
7987     AddRuntimeUnrollDisableMetaData(L);
7988 
7989   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7990   //    predication, updating analyses.
7991   ILV.fixVectorizedLoop(State);
7992 
7993   ILV.printDebugTracesAtEnd();
7994 }
7995 
7996 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7997 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7998   for (const auto &Plan : VPlans)
7999     if (PrintVPlansInDotFormat)
8000       Plan->printDOT(O);
8001     else
8002       Plan->print(O);
8003 }
8004 #endif
8005 
8006 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
8007     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
8008 
8009   // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
8012   SmallVector<BasicBlock*> ExitingBlocks;
8013   OrigLoop->getExitingBlocks(ExitingBlocks);
8014   for (auto *BB : ExitingBlocks) {
8015     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8016     if (!Cmp || !Cmp->hasOneUse())
8017       continue;
8018 
8019     // TODO: we should introduce a getUniqueExitingBlocks on Loop
8020     if (!DeadInstructions.insert(Cmp).second)
8021       continue;
8022 
    // The operands of the icmp are often a dead trunc, used by IndUpdate.
8024     // TODO: can recurse through operands in general
8025     for (Value *Op : Cmp->operands()) {
8026       if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
8028     }
8029   }
8030 
8031   // We create new "steps" for induction variable updates to which the original
8032   // induction variables map. An original update instruction will be dead if
8033   // all its users except the induction variable are dead.
8034   auto *Latch = OrigLoop->getLoopLatch();
8035   for (auto &Induction : Legal->getInductionVars()) {
8036     PHINode *Ind = Induction.first;
8037     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8038 
8039     // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
8041     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8042       continue;
8043 
8044     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8045           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8046         }))
8047       DeadInstructions.insert(IndUpdate);
8048   }
8049 }
8050 
8051 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8052 
8053 //===--------------------------------------------------------------------===//
8054 // EpilogueVectorizerMainLoop
8055 //===--------------------------------------------------------------------===//
8056 
8057 /// This function is partially responsible for generating the control flow
8058 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8059 std::pair<BasicBlock *, Value *>
8060 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8061   MDNode *OrigLoopID = OrigLoop->getLoopID();
8062   Loop *Lp = createVectorLoopSkeleton("");
8063 
8064   // Generate the code to check the minimum iteration count of the vector
8065   // epilogue (see below).
8066   EPI.EpilogueIterationCountCheck =
8067       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8068   EPI.EpilogueIterationCountCheck->setName("iter.check");
8069 
8070   // Generate the code to check any assumptions that we've made for SCEV
8071   // expressions.
8072   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8073 
8074   // Generate the code that checks at runtime if arrays overlap. We put the
8075   // checks into a separate block to make the more common case of few elements
8076   // faster.
8077   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8078 
8079   // Generate the iteration count check for the main loop, *after* the check
8080   // for the epilogue loop, so that the path-length is shorter for the case
8081   // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
8084   // the epilogue.
8085   EPI.MainLoopIterationCountCheck =
8086       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8087 
8088   // Generate the induction variable.
8089   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8090   EPI.VectorTripCount = CountRoundDown;
8091   createHeaderBranch(Lp);
8092 
  // Skip creating induction resume values here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the VPlan in the second pass still contains the inductions from the
8096   // original loop.
8097 
8098   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
8099 }
8100 
8101 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8102   LLVM_DEBUG({
8103     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8104            << "Main Loop VF:" << EPI.MainLoopVF
8105            << ", Main Loop UF:" << EPI.MainLoopUF
8106            << ", Epilogue Loop VF:" << EPI.EpilogueVF
8107            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8108   });
8109 }
8110 
8111 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8112   DEBUG_WITH_TYPE(VerboseDebug, {
8113     dbgs() << "intermediate fn:\n"
8114            << *OrigLoop->getHeader()->getParent() << "\n";
8115   });
8116 }
8117 
8118 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8119     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8120   assert(L && "Expected valid Loop.");
8121   assert(Bypass && "Expected valid bypass basic block.");
8122   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8123   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8124   Value *Count = getOrCreateTripCount(L);
8125   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
8127   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8128   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8129 
8130   // Generate code to check if the loop's trip count is less than VF * UF of the
8131   // main vector loop.
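  // For example, with VF = 4 and UF = 2 the branch to Bypass is taken when the
  // trip count is below 8 (or at most 8 when a scalar epilogue is required).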
8132   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8133       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8134 
8135   Value *CheckMinIters = Builder.CreateICmp(
8136       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
8137       "min.iters.check");
8138 
8139   if (!ForEpilogue)
8140     TCCheckBlock->setName("vector.main.loop.iter.check");
8141 
8142   // Create new preheader for vector loop.
8143   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8144                                    DT, LI, nullptr, "vector.ph");
8145 
8146   if (ForEpilogue) {
8147     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8148                                  DT->getNode(Bypass)->getIDom()) &&
8149            "TC check is expected to dominate Bypass");
8150 
8151     // Update dominator for Bypass & LoopExit.
8152     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8153     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8154       // For loops with multiple exits, there's no edge from the middle block
8155       // to exit blocks (as the epilogue must run) and thus no need to update
8156       // the immediate dominator of the exit blocks.
8157       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8158 
8159     LoopBypassBlocks.push_back(TCCheckBlock);
8160 
8161     // Save the trip count so we don't have to regenerate it in the
8162     // vec.epilog.iter.check. This is safe to do because the trip count
8163     // generated here dominates the vector epilog iter check.
8164     EPI.TripCount = Count;
8165   }
8166 
8167   ReplaceInstWithInst(
8168       TCCheckBlock->getTerminator(),
8169       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8170 
8171   return TCCheckBlock;
8172 }
8173 
8174 //===--------------------------------------------------------------------===//
8175 // EpilogueVectorizerEpilogueLoop
8176 //===--------------------------------------------------------------------===//
8177 
8178 /// This function is partially responsible for generating the control flow
8179 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8180 std::pair<BasicBlock *, Value *>
8181 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8182   MDNode *OrigLoopID = OrigLoop->getLoopID();
8183   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8184 
  // Now, compare the remaining count; if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
8187   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8188   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8189   LoopVectorPreHeader =
8190       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8191                  LI, nullptr, "vec.epilog.ph");
8192   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8193                                           VecEpilogueIterationCountCheck);
8194 
8195   // Adjust the control flow taking the state info from the main loop
8196   // vectorization into account.
8197   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8198          "expected this to be saved from the previous pass.");
8199   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8200       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8201 
8202   DT->changeImmediateDominator(LoopVectorPreHeader,
8203                                EPI.MainLoopIterationCountCheck);
8204 
8205   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8206       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8207 
8208   if (EPI.SCEVSafetyCheck)
8209     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8210         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8211   if (EPI.MemSafetyCheck)
8212     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8213         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8214 
8215   DT->changeImmediateDominator(
8216       VecEpilogueIterationCountCheck,
8217       VecEpilogueIterationCountCheck->getSinglePredecessor());
8218 
8219   DT->changeImmediateDominator(LoopScalarPreHeader,
8220                                EPI.EpilogueIterationCountCheck);
8221   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8222     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
8224     // dominator of the exit blocks.
8225     DT->changeImmediateDominator(LoopExitBlock,
8226                                  EPI.EpilogueIterationCountCheck);
8227 
8228   // Keep track of bypass blocks, as they feed start values to the induction
8229   // phis in the scalar loop preheader.
8230   if (EPI.SCEVSafetyCheck)
8231     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8232   if (EPI.MemSafetyCheck)
8233     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8234   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8235 
8236   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
8237   // merge control-flow from the latch block and the middle block. Update the
8238   // incoming values here and move the Phi into the preheader.
8239   SmallVector<PHINode *, 4> PhisInBlock;
8240   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8241     PhisInBlock.push_back(&Phi);
8242 
8243   for (PHINode *Phi : PhisInBlock) {
8244     Phi->replaceIncomingBlockWith(
8245         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8246         VecEpilogueIterationCountCheck);
8247     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8248     if (EPI.SCEVSafetyCheck)
8249       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8250     if (EPI.MemSafetyCheck)
8251       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8252     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8253   }
8254 
  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
8257   Type *IdxTy = Legal->getWidestInductionType();
8258   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8259                                          LoopVectorPreHeader->getFirstNonPHI());
8260   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8261   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8262                            EPI.MainLoopIterationCountCheck);
8263 
8264   // Generate the induction variable.
8265   createHeaderBranch(Lp);
8266 
8267   // Generate induction resume values. These variables save the new starting
8268   // indexes for the scalar loop. They are used to test if there are any tail
8269   // iterations left once the vector loop has completed.
8270   // Note that when the vectorized epilogue is skipped due to iteration count
8271   // check, then the resume value for the induction variable comes from
8272   // the trip count of the main vector loop, hence passing the AdditionalBypass
8273   // argument.
8274   createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
8275                                    EPI.VectorTripCount} /* AdditionalBypass */);
8276 
8277   return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
8278 }
8279 
8280 BasicBlock *
8281 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8282     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8283 
8284   assert(EPI.TripCount &&
8285          "Expected trip count to have been safed in the first pass.");
8286   assert(
8287       (!isa<Instruction>(EPI.TripCount) ||
8288        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8289       "saved trip count does not dominate insertion point.");
8290   Value *TC = EPI.TripCount;
8291   IRBuilder<> Builder(Insert->getTerminator());
8292   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8293 
8294   // Generate code to check if the loop's trip count is less than VF * UF of the
8295   // vector epilogue loop.
8296   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8297       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8298 
8299   Value *CheckMinIters =
8300       Builder.CreateICmp(P, Count,
8301                          createStepForVF(Builder, Count->getType(),
8302                                          EPI.EpilogueVF, EPI.EpilogueUF),
8303                          "min.epilog.iters.check");
8304 
8305   ReplaceInstWithInst(
8306       Insert->getTerminator(),
8307       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8308 
8309   LoopBypassBlocks.push_back(Insert);
8310   return Insert;
8311 }
8312 
8313 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8314   LLVM_DEBUG({
8315     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8316            << "Epilogue Loop VF:" << EPI.EpilogueVF
8317            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8318   });
8319 }
8320 
8321 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8322   DEBUG_WITH_TYPE(VerboseDebug, {
8323     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8324   });
8325 }
8326 
8327 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8328     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8329   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8330   bool PredicateAtRangeStart = Predicate(Range.Start);
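  // Walk the range in powers of two and clamp Range.End at the first VF for
  // which the predicate result changes. For example, if the predicate holds
  // for VF = 2 and VF = 4 but not for VF = 8 in the range [2, 16), the range
  // is clamped to [2, 8) and the result at Range.Start (true) is returned.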
8331 
8332   for (ElementCount TmpVF = Range.Start * 2;
8333        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8334     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8335       Range.End = TmpVF;
8336       break;
8337     }
8338 
8339   return PredicateAtRangeStart;
8340 }
8341 
8342 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8343 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8344 /// of VF's starting at a given VF and extending it as much as possible. Each
8345 /// vectorization decision can potentially shorten this sub-range during
8346 /// buildVPlan().
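/// For example, with MinVF = 2 and MaxVF = 16 this may produce one VPlan
/// covering the sub-range [2, 8) and another covering [8, 17), depending on
/// how each buildVPlan() call clamps its sub-range.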
8347 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8348                                            ElementCount MaxVF) {
8349   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8350   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8351     VFRange SubRange = {VF, MaxVFPlusOne};
8352     VPlans.push_back(buildVPlan(SubRange));
8353     VF = SubRange.End;
8354   }
8355 }
8356 
8357 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8358                                          VPlanPtr &Plan) {
8359   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8360 
8361   // Look for cached value.
8362   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8363   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8364   if (ECEntryIt != EdgeMaskCache.end())
8365     return ECEntryIt->second;
8366 
8367   VPValue *SrcMask = createBlockInMask(Src, Plan);
8368 
8369   // The terminator has to be a branch inst!
8370   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8371   assert(BI && "Unexpected terminator found");
8372 
8373   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8374     return EdgeMaskCache[Edge] = SrcMask;
8375 
8376   // If source is an exiting block, we know the exit edge is dynamically dead
8377   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8378   // adding uses of an otherwise potentially dead instruction.
8379   if (OrigLoop->isLoopExiting(Src))
8380     return EdgeMaskCache[Edge] = SrcMask;
8381 
8382   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8383   assert(EdgeMask && "No Edge Mask found for condition");
8384 
8385   if (BI->getSuccessor(0) != Dst)
8386     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8387 
8388   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8389     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8390     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8391     // The select version does not introduce new UB if SrcMask is false and
8392     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8393     VPValue *False = Plan->getOrAddVPValue(
8394         ConstantInt::getFalse(BI->getCondition()->getType()));
8395     EdgeMask =
8396         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8397   }
8398 
8399   return EdgeMaskCache[Edge] = EdgeMask;
8400 }
8401 
8402 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
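  // Compute (and cache) the mask under which block BB executes: null for an
  // all-one mask, the header mask when folding the tail, or the OR of the
  // masks of all incoming edges otherwise.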
8403   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8404 
8405   // Look for cached value.
8406   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8407   if (BCEntryIt != BlockMaskCache.end())
8408     return BCEntryIt->second;
8409 
8410   // All-one mask is modelled as no-mask following the convention for masked
8411   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8412   VPValue *BlockMask = nullptr;
8413 
8414   if (OrigLoop->getHeader() == BB) {
8415     if (!CM.blockNeedsPredicationForAnyReason(BB))
8416       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8417 
8418     // Introduce the early-exit compare IV <= BTC to form header block mask.
8419     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8420     // constructing the desired canonical IV in the header block as its first
8421     // non-phi instructions.
8422     assert(CM.foldTailByMasking() && "must fold the tail");
8423     VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
8424     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8425     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8426     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8427 
8428     VPBuilder::InsertPointGuard Guard(Builder);
8429     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8430     if (CM.TTI.emitGetActiveLaneMask()) {
8431       VPValue *TC = Plan->getOrCreateTripCount();
8432       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8433     } else {
8434       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8435       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8436     }
8437     return BlockMaskCache[BB] = BlockMask;
8438   }
8439 
8440   // This is the block mask. We OR all incoming edges.
8441   for (auto *Predecessor : predecessors(BB)) {
8442     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8443     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8444       return BlockMaskCache[BB] = EdgeMask;
8445 
8446     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8447       BlockMask = EdgeMask;
8448       continue;
8449     }
8450 
8451     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8452   }
8453 
8454   return BlockMaskCache[BB] = BlockMask;
8455 }
8456 
8457 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8458                                                 ArrayRef<VPValue *> Operands,
8459                                                 VFRange &Range,
8460                                                 VPlanPtr &Plan) {
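  // Build a VPWidenMemoryInstructionRecipe if the cost model decided not to
  // scalarize this access for any VF remaining in Range (after clamping);
  // otherwise return nullptr so the access gets replicated instead.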
8461   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8462          "Must be called with either a load or store");
8463 
8464   auto willWiden = [&](ElementCount VF) -> bool {
8465     if (VF.isScalar())
8466       return false;
8467     LoopVectorizationCostModel::InstWidening Decision =
8468         CM.getWideningDecision(I, VF);
8469     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8470            "CM decision should be taken at this point.");
8471     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8472       return true;
8473     if (CM.isScalarAfterVectorization(I, VF) ||
8474         CM.isProfitableToScalarize(I, VF))
8475       return false;
8476     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8477   };
8478 
8479   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8480     return nullptr;
8481 
8482   VPValue *Mask = nullptr;
8483   if (Legal->isMaskRequired(I))
8484     Mask = createBlockInMask(I->getParent(), Plan);
8485 
8486   // Determine if the pointer operand of the access is either consecutive or
8487   // reverse consecutive.
8488   LoopVectorizationCostModel::InstWidening Decision =
8489       CM.getWideningDecision(I, Range.Start);
8490   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8491   bool Consecutive =
8492       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8493 
8494   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8495     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8496                                               Consecutive, Reverse);
8497 
8498   StoreInst *Store = cast<StoreInst>(I);
8499   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8500                                             Mask, Consecutive, Reverse);
8501 }
8502 
8503 static VPWidenIntOrFpInductionRecipe *
8504 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
8505                            VPValue *Start, const InductionDescriptor &IndDesc,
8506                            LoopVectorizationCostModel &CM, Loop &OrigLoop,
8507                            VFRange &Range) {
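  // Determine, for the VFs remaining in Range, whether the induction needs a
  // scalar IV (some in-loop user is scalarized) and whether only the scalar
  // IV is needed (the phi/trunc itself is scalarized), then build the widened
  // induction recipe accordingly.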
8508   // Returns true if an instruction \p I should be scalarized instead of
8509   // vectorized for the chosen vectorization factor.
8510   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8511     return CM.isScalarAfterVectorization(I, VF) ||
8512            CM.isProfitableToScalarize(I, VF);
8513   };
8514 
8515   bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
8516       [&](ElementCount VF) {
8517         // Returns true if we should generate a scalar version of \p IV.
8518         if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
8519           return true;
8520         auto isScalarInst = [&](User *U) -> bool {
8521           auto *I = cast<Instruction>(U);
8522           return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
8523         };
8524         return any_of(PhiOrTrunc->users(), isScalarInst);
8525       },
8526       Range);
8527   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8528       [&](ElementCount VF) {
8529         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8530       },
8531       Range);
8532   assert(IndDesc.getStartValue() ==
8533          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8534   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8535     return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI,
8536                                              NeedsScalarIV, !NeedsScalarIVOnly);
8537   }
8538   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8539   return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
8540                                            !NeedsScalarIVOnly);
8541 }
8542 
8543 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8544     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
8545 
8546   // Check if this is an integer or fp induction. If so, build the recipe that
8547   // produces its scalar and vector values.
8548   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8549     return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop,
8550                                       Range);
8551 
8552   return nullptr;
8553 }
8554 
8555 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8556     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8557     VPlan &Plan) const {
8558   // Optimize the special case where the source is a constant integer
8559   // induction variable. Notice that we can only optimize the 'trunc' case
8560   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8561   // (c) other casts depend on pointer size.
8562 
8563   // Determine whether \p K is a truncation based on an induction variable that
8564   // can be optimized.
8565   auto isOptimizableIVTruncate =
8566       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8567     return [=](ElementCount VF) -> bool {
8568       return CM.isOptimizableIVTruncate(K, VF);
8569     };
8570   };
8571 
8572   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8573           isOptimizableIVTruncate(I), Range)) {
8574 
8575     auto *Phi = cast<PHINode>(I->getOperand(0));
8576     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8577     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8578     return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range);
8579   }
8580   return nullptr;
8581 }
8582 
8583 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8584                                                 ArrayRef<VPValue *> Operands,
8585                                                 VPlanPtr &Plan) {
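  // Turn a non-header phi into either its single incoming VPValue (when all
  // incoming values are the same) or a VPBlendRecipe pairing each incoming
  // value with the mask of its incoming edge.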
8586   // If all incoming values are equal, the incoming VPValue can be used directly
8587   // instead of creating a new VPBlendRecipe.
8588   VPValue *FirstIncoming = Operands[0];
8589   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8590         return FirstIncoming == Inc;
8591       })) {
8592     return Operands[0];
8593   }
8594 
8595   // We know that all PHIs in non-header blocks are converted into selects, so
8596   // we don't have to worry about the insertion order and we can just use the
8597   // builder. At this point we generate the predication tree. There may be
8598   // duplications since this is a simple recursive scan, but future
8599   // optimizations will clean it up.
8600   SmallVector<VPValue *, 2> OperandsWithMask;
8601   unsigned NumIncoming = Phi->getNumIncomingValues();
8602 
8603   for (unsigned In = 0; In < NumIncoming; In++) {
8604     VPValue *EdgeMask =
8605       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8606     assert((EdgeMask || NumIncoming == 1) &&
8607            "Multiple predecessors with one having a full mask");
8608     OperandsWithMask.push_back(Operands[In]);
8609     if (EdgeMask)
8610       OperandsWithMask.push_back(EdgeMask);
8611   }
8612   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8613 }
8614 
8615 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8616                                                    ArrayRef<VPValue *> Operands,
8617                                                    VFRange &Range) const {
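  // Widen a call only if it is not scalarized with predication, is not one of
  // the intrinsics never widened here (assume, lifetime markers, sideeffect,
  // pseudoprobe, noalias scope decl), and the cost model finds a vector
  // intrinsic or vector library call worthwhile for every VF left in Range.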
8618 
8619   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8620       [this, CI](ElementCount VF) {
8621         return CM.isScalarWithPredication(CI, VF);
8622       },
8623       Range);
8624 
8625   if (IsPredicated)
8626     return nullptr;
8627 
8628   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8629   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8630              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8631              ID == Intrinsic::pseudoprobe ||
8632              ID == Intrinsic::experimental_noalias_scope_decl))
8633     return nullptr;
8634 
8635   auto willWiden = [&](ElementCount VF) -> bool {
8636     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8637     // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform the intrinsic call compared to the lib call?
8641     bool NeedToScalarize = false;
8642     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8643     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8644     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8645     return UseVectorIntrinsic || !NeedToScalarize;
8646   };
8647 
8648   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8649     return nullptr;
8650 
8651   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8652   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8653 }
8654 
8655 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8656   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8657          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8658   // Instruction should be widened, unless it is scalar after vectorization,
8659   // scalarization is profitable or it is predicated.
8660   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8661     return CM.isScalarAfterVectorization(I, VF) ||
8662            CM.isProfitableToScalarize(I, VF) ||
8663            CM.isScalarWithPredication(I, VF);
8664   };
8665   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8666                                                              Range);
8667 }
8668 
8669 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8670                                            ArrayRef<VPValue *> Operands) const {
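  // Widen instructions whose opcode appears in the list below into a single
  // VPWidenRecipe; unsupported opcodes return nullptr so the caller falls
  // back to replication.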
8671   auto IsVectorizableOpcode = [](unsigned Opcode) {
8672     switch (Opcode) {
8673     case Instruction::Add:
8674     case Instruction::And:
8675     case Instruction::AShr:
8676     case Instruction::BitCast:
8677     case Instruction::FAdd:
8678     case Instruction::FCmp:
8679     case Instruction::FDiv:
8680     case Instruction::FMul:
8681     case Instruction::FNeg:
8682     case Instruction::FPExt:
8683     case Instruction::FPToSI:
8684     case Instruction::FPToUI:
8685     case Instruction::FPTrunc:
8686     case Instruction::FRem:
8687     case Instruction::FSub:
8688     case Instruction::ICmp:
8689     case Instruction::IntToPtr:
8690     case Instruction::LShr:
8691     case Instruction::Mul:
8692     case Instruction::Or:
8693     case Instruction::PtrToInt:
8694     case Instruction::SDiv:
8695     case Instruction::Select:
8696     case Instruction::SExt:
8697     case Instruction::Shl:
8698     case Instruction::SIToFP:
8699     case Instruction::SRem:
8700     case Instruction::Sub:
8701     case Instruction::Trunc:
8702     case Instruction::UDiv:
8703     case Instruction::UIToFP:
8704     case Instruction::URem:
8705     case Instruction::Xor:
8706     case Instruction::ZExt:
8707       return true;
8708     }
8709     return false;
8710   };
8711 
8712   if (!IsVectorizableOpcode(I->getOpcode()))
8713     return nullptr;
8714 
8715   // Success: widen this instruction.
8716   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8717 }
8718 
8719 void VPRecipeBuilder::fixHeaderPhis() {
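  // Now that recipes exist for all backedge values, add the operand coming
  // from the loop latch to each header phi recipe recorded in PhisToFix.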
8720   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8721   for (VPHeaderPHIRecipe *R : PhisToFix) {
8722     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8723     VPRecipeBase *IncR =
8724         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8725     R->addOperand(IncR->getVPSingleValue());
8726   }
8727 }
8728 
8729 VPBasicBlock *VPRecipeBuilder::handleReplication(
8730     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8731     VPlanPtr &Plan) {
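  // Scalarize I by replicating it, either once per vector lane or once in
  // total when it is uniform; predicated replication additionally wraps the
  // recipe in an if-then replicate region.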
8732   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8733       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8734       Range);
8735 
8736   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8737       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8738       Range);
8739 
8740   // Even if the instruction is not marked as uniform, there are certain
8741   // intrinsic calls that can be effectively treated as such, so we check for
8742   // them here. Conservatively, we only do this for scalable vectors, since
8743   // for fixed-width VFs we can always fall back on full scalarization.
8744   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8745     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8746     case Intrinsic::assume:
8747     case Intrinsic::lifetime_start:
8748     case Intrinsic::lifetime_end:
8749       // For scalable vectors if one of the operands is variant then we still
8750       // want to mark as uniform, which will generate one instruction for just
8751       // the first lane of the vector. We can't scalarize the call in the same
8752       // way as for fixed-width vectors because we don't know how many lanes
8753       // there are.
8754       //
8755       // The reasons for doing it this way for scalable vectors are:
8756       //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still better than not generating any at all. For
8758       //      example, the input may be a splat across all lanes.
8759       //   2. For the lifetime start/end intrinsics the pointer operand only
8760       //      does anything useful when the input comes from a stack object,
8761       //      which suggests it should always be uniform. For non-stack objects
8762       //      the effect is to poison the object, which still allows us to
8763       //      remove the call.
8764       IsUniform = true;
8765       break;
8766     default:
8767       break;
8768     }
8769   }
8770 
8771   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8772                                        IsUniform, IsPredicated);
8773   setRecipe(I, Recipe);
8774   Plan->addVPValue(I, Recipe);
8775 
8776   // Find if I uses a predicated instruction. If so, it will use its scalar
8777   // value. Avoid hoisting the insert-element which packs the scalar value into
8778   // a vector value, as that happens iff all users use the vector value.
8779   for (VPValue *Op : Recipe->operands()) {
8780     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8781     if (!PredR)
8782       continue;
8783     auto *RepR =
8784         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8785     assert(RepR->isPredicated() &&
8786            "expected Replicate recipe to be predicated");
8787     RepR->setAlsoPack(false);
8788   }
8789 
  // Finalize the recipe for Instr, handling the non-predicated case first.
8791   if (!IsPredicated) {
8792     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8793     VPBB->appendRecipe(Recipe);
8794     return VPBB;
8795   }
8796   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8797 
8798   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8799   assert(SingleSucc && "VPBB must have a single successor when handling "
8800                        "predicated replication.");
8801   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8802   // Record predicated instructions for above packing optimizations.
8803   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8804   VPBlockUtils::insertBlockAfter(Region, VPBB);
8805   auto *RegSucc = new VPBasicBlock();
8806   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8807   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8808   return RegSucc;
8809 }
8810 
8811 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8812                                                       VPRecipeBase *PredRecipe,
8813                                                       VPlanPtr &Plan) {
8814   // Instructions marked for predication are replicated and placed under an
8815   // if-then construct to prevent side-effects.
8816 
8817   // Generate recipes to compute the block mask for this region.
8818   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8819 
8820   // Build the triangular if-then region.
8821   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8822   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8823   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8824   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8825   auto *PHIRecipe = Instr->getType()->isVoidTy()
8826                         ? nullptr
8827                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8828   if (PHIRecipe) {
8829     Plan->removeVPValueFor(Instr);
8830     Plan->addVPValue(Instr, PHIRecipe);
8831   }
8832   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8833   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8834   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8835 
8836   // Note: first set Entry as region entry and then connect successors starting
8837   // from it in order, to propagate the "parent" of each VPBasicBlock.
8838   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8839   VPBlockUtils::connectBlocks(Pred, Exit);
8840 
8841   return Region;
8842 }
8843 
8844 VPRecipeOrVPValueTy
8845 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8846                                         ArrayRef<VPValue *> Operands,
8847                                         VFRange &Range, VPlanPtr &Plan) {
8848   // First, check for specific widening recipes that deal with calls, memory
8849   // operations, inductions and Phi nodes.
8850   if (auto *CI = dyn_cast<CallInst>(Instr))
8851     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8852 
8853   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8854     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8855 
8856   VPRecipeBase *Recipe;
8857   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8858     if (Phi->getParent() != OrigLoop->getHeader())
8859       return tryToBlend(Phi, Operands, Plan);
8860     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8861       return toVPRecipeResult(Recipe);
8862 
8863     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8864     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8865       VPValue *StartV = Operands[0];
8866       if (Legal->isReductionVariable(Phi)) {
8867         const RecurrenceDescriptor &RdxDesc =
8868             Legal->getReductionVars().find(Phi)->second;
8869         assert(RdxDesc.getRecurrenceStartValue() ==
8870                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8871         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8872                                              CM.isInLoopReduction(Phi),
8873                                              CM.useOrderedReductions(RdxDesc));
8874       } else {
8875         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8876       }
8877 
8878       // Record the incoming value from the backedge, so we can add the incoming
8879       // value from the backedge after all recipes have been created.
8880       recordRecipeOf(cast<Instruction>(
8881           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8882       PhisToFix.push_back(PhiRecipe);
8883     } else {
8884       // TODO: record backedge value for remaining pointer induction phis.
8885       assert(Phi->getType()->isPointerTy() &&
8886              "only pointer phis should be handled here");
8887       assert(Legal->getInductionVars().count(Phi) &&
8888              "Not an induction variable");
8889       InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8890       VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
8891       PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
8892     }
8893 
8894     return toVPRecipeResult(PhiRecipe);
8895   }
8896 
8897   if (isa<TruncInst>(Instr) &&
8898       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8899                                                Range, *Plan)))
8900     return toVPRecipeResult(Recipe);
8901 
8902   if (!shouldWiden(Instr, Range))
8903     return nullptr;
8904 
8905   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8906     return toVPRecipeResult(new VPWidenGEPRecipe(
8907         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8908 
8909   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8910     bool InvariantCond =
8911         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8912     return toVPRecipeResult(new VPWidenSelectRecipe(
8913         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8914   }
8915 
8916   return toVPRecipeResult(tryToWiden(Instr, Operands));
8917 }
8918 
8919 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8920                                                         ElementCount MaxVF) {
8921   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8922 
8923   // Collect instructions from the original loop that will become trivially dead
8924   // in the vectorized loop. We don't need to vectorize these instructions. For
8925   // example, original induction update instructions can become dead because we
8926   // separately emit induction "steps" when generating code for the new loop.
8927   // Similarly, we create a new latch condition when setting up the structure
8928   // of the new loop, so the old one can become dead.
8929   SmallPtrSet<Instruction *, 4> DeadInstructions;
8930   collectTriviallyDeadInstructions(DeadInstructions);
8931 
8932   // Add assume instructions we need to drop to DeadInstructions, to prevent
8933   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8935   // control flow is preserved, we should keep them.
8936   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8937   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8938 
8939   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8940   // Dead instructions do not need sinking. Remove them from SinkAfter.
8941   for (Instruction *I : DeadInstructions)
8942     SinkAfter.erase(I);
8943 
8944   // Cannot sink instructions after dead instructions (there won't be any
8945   // recipes for them). Instead, find the first non-dead previous instruction.
8946   for (auto &P : Legal->getSinkAfter()) {
8947     Instruction *SinkTarget = P.second;
8948     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8949     (void)FirstInst;
8950     while (DeadInstructions.contains(SinkTarget)) {
8951       assert(
8952           SinkTarget != FirstInst &&
8953           "Must find a live instruction (at least the one feeding the "
8954           "first-order recurrence PHI) before reaching beginning of the block");
8955       SinkTarget = SinkTarget->getPrevNode();
8956       assert(SinkTarget != P.first &&
8957              "sink source equals target, no sinking required");
8958     }
8959     P.second = SinkTarget;
8960   }
8961 
8962   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8963   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8964     VFRange SubRange = {VF, MaxVFPlusOne};
8965     VPlans.push_back(
8966         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8967     VF = SubRange.End;
8968   }
8969 }
8970 
8971 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8972 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8973 // BranchOnCount VPInstruction to the latch.
8974 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8975                                   bool HasNUW, bool IsVPlanNative) {
8976   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8977   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8978 
8979   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8980   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8981   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8982   if (IsVPlanNative)
8983     Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
8984   Header->insert(CanonicalIVPHI, Header->begin());
8985 
8986   auto *CanonicalIVIncrement =
8987       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8988                                : VPInstruction::CanonicalIVIncrement,
8989                         {CanonicalIVPHI}, DL);
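  // The increment feeds back into the canonical IV phi as its backedge value.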
8990   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8991 
8992   VPBasicBlock *EB = TopRegion->getExitBasicBlock();
8993   if (IsVPlanNative) {
8994     EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
8995     EB->setCondBit(nullptr);
8996   }
8997   EB->appendRecipe(CanonicalIVIncrement);
8998 
8999   auto *BranchOnCount =
9000       new VPInstruction(VPInstruction::BranchOnCount,
9001                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
9002   EB->appendRecipe(BranchOnCount);
9003 }
9004 
9005 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
9006     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
9007     const MapVector<Instruction *, Instruction *> &SinkAfter) {
9008 
9009   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9010 
9011   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
9012 
9013   // ---------------------------------------------------------------------------
9014   // Pre-construction: record ingredients whose recipes we'll need to further
9015   // process after constructing the initial VPlan.
9016   // ---------------------------------------------------------------------------
9017 
9018   // Mark instructions we'll need to sink later and their targets as
9019   // ingredients whose recipe we'll need to record.
9020   for (auto &Entry : SinkAfter) {
9021     RecipeBuilder.recordRecipeOf(Entry.first);
9022     RecipeBuilder.recordRecipeOf(Entry.second);
9023   }
9024   for (auto &Reduction : CM.getInLoopReductionChains()) {
9025     PHINode *Phi = Reduction.first;
9026     RecurKind Kind =
9027         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
9028     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9029 
9030     RecipeBuilder.recordRecipeOf(Phi);
9031     for (auto &R : ReductionOperations) {
9032       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
9034       // need to record the ICmp recipe, so it can be removed later.
9035       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9036              "Only min/max recurrences allowed for inloop reductions");
9037       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9038         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9039     }
9040   }
9041 
9042   // For each interleave group which is relevant for this (possibly trimmed)
9043   // Range, add it to the set of groups to be later applied to the VPlan and add
9044   // placeholders for its members' Recipes which we'll be replacing with a
9045   // single VPInterleaveRecipe.
9046   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9047     auto applyIG = [IG, this](ElementCount VF) -> bool {
9048       return (VF.isVector() && // Query is illegal for VF == 1
9049               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9050                   LoopVectorizationCostModel::CM_Interleave);
9051     };
9052     if (!getDecisionAndClampRange(applyIG, Range))
9053       continue;
9054     InterleaveGroups.insert(IG);
9055     for (unsigned i = 0; i < IG->getFactor(); i++)
9056       if (Instruction *Member = IG->getMember(i))
9057         RecipeBuilder.recordRecipeOf(Member);
9058   };
9059 
9060   // ---------------------------------------------------------------------------
9061   // Build initial VPlan: Scan the body of the loop in a topological order to
9062   // visit each basic block after having visited its predecessor basic blocks.
9063   // ---------------------------------------------------------------------------
9064 
9065   // Create initial VPlan skeleton, with separate header and latch blocks.
9066   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
9067   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
9068   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
9069   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
9070   auto Plan = std::make_unique<VPlan>(TopRegion);
9071 
9072   Instruction *DLInst =
9073       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9074   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
9075                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
9076                         !CM.foldTailByMasking(), false);
9077 
9078   // Scan the body of the loop in a topological order to visit each basic block
9079   // after having visited its predecessor basic blocks.
9080   LoopBlocksDFS DFS(OrigLoop);
9081   DFS.perform(LI);
9082 
9083   VPBasicBlock *VPBB = HeaderVPBB;
9084   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9085   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9086     // Relevant instructions from basic block BB will be grouped into VPRecipe
9087     // ingredients and fill a new VPBasicBlock.
9088     unsigned VPBBsForBB = 0;
9089     VPBB->setName(BB->getName());
9090     Builder.setInsertPoint(VPBB);
9091 
9092     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
9094     for (Instruction &I : BB->instructionsWithoutDebug()) {
9095       Instruction *Instr = &I;
9096 
9097       // First filter out irrelevant instructions, to ensure no recipes are
9098       // built for them.
9099       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9100         continue;
9101 
9102       SmallVector<VPValue *, 4> Operands;
9103       auto *Phi = dyn_cast<PHINode>(Instr);
9104       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9105         Operands.push_back(Plan->getOrAddVPValue(
9106             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9107       } else {
9108         auto OpRange = Plan->mapToVPValues(Instr->operands());
9109         Operands = {OpRange.begin(), OpRange.end()};
9110       }
9111       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9112               Instr, Operands, Range, Plan)) {
9113         // If Instr can be simplified to an existing VPValue, use it.
9114         if (RecipeOrValue.is<VPValue *>()) {
9115           auto *VPV = RecipeOrValue.get<VPValue *>();
9116           Plan->addVPValue(Instr, VPV);
9117           // If the re-used value is a recipe, register the recipe for the
9118           // instruction, in case the recipe for Instr needs to be recorded.
9119           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9120             RecipeBuilder.setRecipe(Instr, R);
9121           continue;
9122         }
9123         // Otherwise, add the new recipe.
9124         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9125         for (auto *Def : Recipe->definedValues()) {
9126           auto *UV = Def->getUnderlyingValue();
9127           Plan->addVPValue(UV, Def);
9128         }
9129 
9130         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
9131             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
9132           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
9133           // of the header block. That can happen for truncates of induction
9134           // variables. Those recipes are moved to the phi section of the header
9135           // block after applying SinkAfter, which relies on the original
9136           // position of the trunc.
9137           assert(isa<TruncInst>(Instr));
9138           InductionsToMove.push_back(
9139               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
9140         }
9141         RecipeBuilder.setRecipe(Instr, Recipe);
9142         VPBB->appendRecipe(Recipe);
9143         continue;
9144       }
9145 
      // Otherwise, if all widening options failed, the instruction is to be
9147       // replicated. This may create a successor for VPBB.
9148       VPBasicBlock *NextVPBB =
9149           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9150       if (NextVPBB != VPBB) {
9151         VPBB = NextVPBB;
9152         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9153                                     : "");
9154       }
9155     }
9156 
9157     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
9158     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9159   }
9160 
9161   // Fold the last, empty block into its predecessor.
9162   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
9163   assert(VPBB && "expected to fold last (empty) block");
9164   // After here, VPBB should not be used.
9165   VPBB = nullptr;
9166 
9167   assert(isa<VPRegionBlock>(Plan->getEntry()) &&
9168          !Plan->getEntry()->getEntryBasicBlock()->empty() &&
9169          "entry block must be set to a VPRegionBlock having a non-empty entry "
9170          "VPBasicBlock");
9171   RecipeBuilder.fixHeaderPhis();
9172 
9173   // ---------------------------------------------------------------------------
9174   // Transform initial VPlan: Apply previously taken decisions, in order, to
9175   // bring the VPlan to its final state.
9176   // ---------------------------------------------------------------------------
9177 
9178   // Apply Sink-After legal constraints.
9179   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9180     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9181     if (Region && Region->isReplicator()) {
9182       assert(Region->getNumSuccessors() == 1 &&
9183              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9184       assert(R->getParent()->size() == 1 &&
9185              "A recipe in an original replicator region must be the only "
9186              "recipe in its block");
9187       return Region;
9188     }
9189     return nullptr;
9190   };
9191   for (auto &Entry : SinkAfter) {
9192     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9193     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9194 
9195     auto *TargetRegion = GetReplicateRegion(Target);
9196     auto *SinkRegion = GetReplicateRegion(Sink);
9197     if (!SinkRegion) {
9198       // If the sink source is not a replicate region, sink the recipe directly.
9199       if (TargetRegion) {
9200         // The target is in a replication region, make sure to move Sink to
9201         // the block after it, not into the replication region itself.
9202         VPBasicBlock *NextBlock =
9203             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9204         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9205       } else
9206         Sink->moveAfter(Target);
9207       continue;
9208     }
9209 
9210     // The sink source is in a replicate region. Unhook the region from the CFG.
9211     auto *SinkPred = SinkRegion->getSinglePredecessor();
9212     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9213     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9214     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9215     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9216 
9217     if (TargetRegion) {
9218       // The target recipe is also in a replicate region, move the sink region
9219       // after the target region.
9220       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9221       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9222       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9223       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9224     } else {
      // The sink source is in a replicate region; we need to move the whole
9226       // replicate region, which should only contain a single recipe in the
9227       // main block.
9228       auto *SplitBlock =
9229           Target->getParent()->splitAt(std::next(Target->getIterator()));
9230 
9231       auto *SplitPred = SplitBlock->getSinglePredecessor();
9232 
9233       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9234       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9235       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9236     }
9237   }
9238 
9239   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9240   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9241 
9242   // Now that sink-after is done, move induction recipes for optimized truncates
9243   // to the phi section of the header block.
9244   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9245     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9246 
9247   // Adjust the recipes for any inloop reductions.
9248   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
9249                              RecipeBuilder, Range.Start);
9250 
9251   // Introduce a recipe to combine the incoming and previous values of a
9252   // first-order recurrence.
9253   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9254     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9255     if (!RecurPhi)
9256       continue;
9257 
9258     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9259     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9260     auto *Region = GetReplicateRegion(PrevRecipe);
9261     if (Region)
9262       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9263     if (Region || PrevRecipe->isPhi())
9264       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9265     else
9266       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9267 
9268     auto *RecurSplice = cast<VPInstruction>(
9269         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9270                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9271 
9272     RecurPhi->replaceAllUsesWith(RecurSplice);
9273     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9274     // all users.
9275     RecurSplice->setOperand(0, RecurPhi);
9276   }
9277 
9278   // Interleave memory: for each Interleave Group we marked earlier as relevant
9279   // for this VPlan, replace the Recipes widening its memory instructions with a
9280   // single VPInterleaveRecipe at its insertion point.
9281   for (auto IG : InterleaveGroups) {
9282     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9283         RecipeBuilder.getRecipe(IG->getInsertPos()));
9284     SmallVector<VPValue *, 4> StoredValues;
9285     for (unsigned i = 0; i < IG->getFactor(); ++i)
9286       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9287         auto *StoreR =
9288             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9289         StoredValues.push_back(StoreR->getStoredValue());
9290       }
9291 
9292     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9293                                         Recipe->getMask());
9294     VPIG->insertBefore(Recipe);
9295     unsigned J = 0;
9296     for (unsigned i = 0; i < IG->getFactor(); ++i)
9297       if (Instruction *Member = IG->getMember(i)) {
9298         if (!Member->getType()->isVoidTy()) {
9299           VPValue *OriginalV = Plan->getVPValue(Member);
9300           Plan->removeVPValueFor(Member);
9301           Plan->addVPValue(Member, VPIG->getVPValue(J));
9302           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9303           J++;
9304         }
9305         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9306       }
9307   }
9308 
9309   // From this point onwards, VPlan-to-VPlan transformations may change the plan
9310   // in ways that accessing values using original IR values is incorrect.
9311   Plan->disableValue2VPValue();
9312 
9313   VPlanTransforms::sinkScalarOperands(*Plan);
9314   VPlanTransforms::mergeReplicateRegions(*Plan);
9315 
9316   std::string PlanName;
9317   raw_string_ostream RSO(PlanName);
9318   ElementCount VF = Range.Start;
9319   Plan->addVF(VF);
9320   RSO << "Initial VPlan for VF={" << VF;
9321   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9322     Plan->addVF(VF);
9323     RSO << "," << VF;
9324   }
9325   RSO << "},UF>=1";
9326   RSO.flush();
9327   Plan->setName(PlanName);
9328 
9329   // Fold Exit block into its predecessor if possible.
9330   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9331   // VPBasicBlock as exit.
9332   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9333 
9334   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9335   return Plan;
9336 }
9337 
9338 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
9340   // transformations before even evaluating whether vectorization is profitable.
9341   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9342   // the vectorization pipeline.
9343   assert(!OrigLoop->isInnermost());
9344   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9345 
9346   // Create new empty VPlan
9347   auto Plan = std::make_unique<VPlan>();
9348 
9349   // Build hierarchical CFG
9350   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9351   HCFGBuilder.buildHierarchicalCFG();
9352 
9353   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9354        VF *= 2)
9355     Plan->addVF(VF);
9356 
9357   if (EnableVPlanPredication) {
9358     VPlanPredicator VPP(*Plan);
9359     VPP.predicate();
9360 
9361     // Avoid running transformation to recipes until masked code generation in
9362     // VPlan-native path is in place.
9363     return Plan;
9364   }
9365 
9366   SmallPtrSet<Instruction *, 1> DeadInstructions;
9367   VPlanTransforms::VPInstructionsToVPRecipes(
9368       OrigLoop, Plan,
9369       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9370       DeadInstructions, *PSE.getSE());
9371 
9372   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9373                         true, true);
9374   return Plan;
9375 }
9376 
9377 // Adjust the recipes for reductions. For in-loop reductions the chain of
9378 // instructions leading from the loop exit instr to the phi need to be converted
9379 // to reductions, with one operand being vector and the other being the scalar
9380 // reduction chain. For other reductions, a select is introduced between the phi
9381 // and live-out recipes when folding the tail.
9382 void LoopVectorizationPlanner::adjustRecipesForReductions(
9383     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9384     ElementCount MinVF) {
9385   for (auto &Reduction : CM.getInLoopReductionChains()) {
9386     PHINode *Phi = Reduction.first;
9387     const RecurrenceDescriptor &RdxDesc =
9388         Legal->getReductionVars().find(Phi)->second;
9389     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9390 
9391     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9392       continue;
9393 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For min/max recurrences the chain will be the select instructions.
9398     Instruction *Chain = Phi;
9399     for (Instruction *R : ReductionOperations) {
9400       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9401       RecurKind Kind = RdxDesc.getRecurrenceKind();
9402 
9403       VPValue *ChainOp = Plan->getVPValue(Chain);
9404       unsigned FirstOpId;
9405       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9406              "Only min/max recurrences allowed for inloop reductions");
9407       // Recognize a call to the llvm.fmuladd intrinsic.
9408       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9409       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9410              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9411       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9412         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9413                "Expected to replace a VPWidenSelectSC");
9414         FirstOpId = 1;
9415       } else {
9416         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9417                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9418                "Expected to replace a VPWidenSC");
9419         FirstOpId = 0;
9420       }
9421       unsigned VecOpId =
9422           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9423       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9424 
9425       auto *CondOp = CM.foldTailByMasking()
9426                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9427                          : nullptr;
9428 
9429       if (IsFMulAdd) {
9430         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9431         // need to create an fmul recipe to use as the vector operand for the
9432         // fadd reduction.
9433         VPInstruction *FMulRecipe = new VPInstruction(
9434             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9435         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9436         WidenRecipe->getParent()->insert(FMulRecipe,
9437                                          WidenRecipe->getIterator());
9438         VecOp = FMulRecipe;
9439       }
9440       VPReductionRecipe *RedRecipe =
9441           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9442       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9443       Plan->removeVPValueFor(R);
9444       Plan->addVPValue(R, RedRecipe);
9445       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9446       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9447       WidenRecipe->eraseFromParent();
9448 
9449       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9450         VPRecipeBase *CompareRecipe =
9451             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9452         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9453                "Expected to replace a VPWidenSC");
9454         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9455                "Expected no remaining users");
9456         CompareRecipe->eraseFromParent();
9457       }
9458       Chain = R;
9459     }
9460   }
9461 
  // If the tail is folded by masking, introduce selects between the phi
9463   // and the live-out instruction of each reduction, at the beginning of the
9464   // dedicated latch block.
9465   if (CM.foldTailByMasking()) {
9466     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9467     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9468       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9469       if (!PhiR || PhiR->isInLoop())
9470         continue;
9471       VPValue *Cond =
9472           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9473       VPValue *Red = PhiR->getBackedgeValue();
9474       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9475              "reduction recipe must be defined before latch");
9476       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9477     }
9478   }
9479 }
9480 
9481 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9482 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9483                                VPSlotTracker &SlotTracker) const {
9484   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9485   IG->getInsertPos()->printAsOperand(O, false);
9486   O << ", ";
9487   getAddr()->printAsOperand(O, SlotTracker);
9488   VPValue *Mask = getMask();
9489   if (Mask) {
9490     O << ", ";
9491     Mask->printAsOperand(O, SlotTracker);
9492   }
9493 
9494   unsigned OpIdx = 0;
9495   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9496     if (!IG->getMember(i))
9497       continue;
9498     if (getNumStoreOperands() > 0) {
9499       O << "\n" << Indent << "  store ";
9500       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9501       O << " to index " << i;
9502     } else {
9503       O << "\n" << Indent << "  ";
9504       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9505       O << " = load from index " << i;
9506     }
9507     ++OpIdx;
9508   }
9509 }
9510 #endif
9511 
9512 void VPWidenCallRecipe::execute(VPTransformState &State) {
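  // Delegate to the ILV helper, passing this recipe both as the value it
  // defines and as the VPUser holding the call's argument operands.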
9513   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9514                                   *this, State);
9515 }
9516 
9517 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9518   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9519   State.ILV->setDebugLocFromInst(&I);
9520 
  // The condition can be loop invariant but still defined inside the
9522   // loop. This means that we can't just use the original 'cond' value.
9523   // We have to take the 'vectorized' value and pick the first lane.
9524   // Instcombine will make this a no-op.
9525   auto *InvarCond =
9526       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9527 
9528   for (unsigned Part = 0; Part < State.UF; ++Part) {
9529     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9530     Value *Op0 = State.get(getOperand(1), Part);
9531     Value *Op1 = State.get(getOperand(2), Part);
9532     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9533     State.set(this, Sel, Part);
9534     State.ILV->addMetadata(Sel, &I);
9535   }
9536 }
9537 
9538 void VPWidenRecipe::execute(VPTransformState &State) {
9539   auto &I = *cast<Instruction>(getUnderlyingValue());
9540   auto &Builder = State.Builder;
9541   switch (I.getOpcode()) {
9542   case Instruction::Call:
9543   case Instruction::Br:
9544   case Instruction::PHI:
9545   case Instruction::GetElementPtr:
9546   case Instruction::Select:
9547     llvm_unreachable("This instruction is handled by a different recipe.");
9548   case Instruction::UDiv:
9549   case Instruction::SDiv:
9550   case Instruction::SRem:
9551   case Instruction::URem:
9552   case Instruction::Add:
9553   case Instruction::FAdd:
9554   case Instruction::Sub:
9555   case Instruction::FSub:
9556   case Instruction::FNeg:
9557   case Instruction::Mul:
9558   case Instruction::FMul:
9559   case Instruction::FDiv:
9560   case Instruction::FRem:
9561   case Instruction::Shl:
9562   case Instruction::LShr:
9563   case Instruction::AShr:
9564   case Instruction::And:
9565   case Instruction::Or:
9566   case Instruction::Xor: {
9567     // Just widen unops and binops.
9568     State.ILV->setDebugLocFromInst(&I);
9569 
9570     for (unsigned Part = 0; Part < State.UF; ++Part) {
9571       SmallVector<Value *, 2> Ops;
9572       for (VPValue *VPOp : operands())
9573         Ops.push_back(State.get(VPOp, Part));
9574 
9575       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9576 
9577       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9578         VecOp->copyIRFlags(&I);
9579 
9580         // If the instruction is vectorized and was in a basic block that needed
9581         // predication, we can't propagate poison-generating flags (nuw/nsw,
9582         // exact, etc.). The control flow has been linearized and the
9583         // instruction is no longer guarded by the predicate, which could make
        // the flag properties no longer hold.
9585         if (State.MayGeneratePoisonRecipes.contains(this))
9586           VecOp->dropPoisonGeneratingFlags();
9587       }
9588 
9589       // Use this vector value for all users of the original instruction.
9590       State.set(this, V, Part);
9591       State.ILV->addMetadata(V, &I);
9592     }
9593 
9594     break;
9595   }
9596   case Instruction::ICmp:
9597   case Instruction::FCmp: {
9598     // Widen compares. Generate vector compares.
9599     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9600     auto *Cmp = cast<CmpInst>(&I);
9601     State.ILV->setDebugLocFromInst(Cmp);
9602     for (unsigned Part = 0; Part < State.UF; ++Part) {
9603       Value *A = State.get(getOperand(0), Part);
9604       Value *B = State.get(getOperand(1), Part);
9605       Value *C = nullptr;
9606       if (FCmp) {
9607         // Propagate fast math flags.
9608         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9609         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9610         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9611       } else {
9612         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9613       }
9614       State.set(this, C, Part);
9615       State.ILV->addMetadata(C, &I);
9616     }
9617 
9618     break;
9619   }
9620 
9621   case Instruction::ZExt:
9622   case Instruction::SExt:
9623   case Instruction::FPToUI:
9624   case Instruction::FPToSI:
9625   case Instruction::FPExt:
9626   case Instruction::PtrToInt:
9627   case Instruction::IntToPtr:
9628   case Instruction::SIToFP:
9629   case Instruction::UIToFP:
9630   case Instruction::Trunc:
9631   case Instruction::FPTrunc:
9632   case Instruction::BitCast: {
9633     auto *CI = cast<CastInst>(&I);
9634     State.ILV->setDebugLocFromInst(CI);
9635 
    // Vectorize casts.
9637     Type *DestTy = (State.VF.isScalar())
9638                        ? CI->getType()
9639                        : VectorType::get(CI->getType(), State.VF);
9640 
9641     for (unsigned Part = 0; Part < State.UF; ++Part) {
9642       Value *A = State.get(getOperand(0), Part);
9643       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9644       State.set(this, Cast, Part);
9645       State.ILV->addMetadata(Cast, &I);
9646     }
9647     break;
9648   }
9649   default:
9650     // This instruction is not vectorized by simple widening.
9651     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9652     llvm_unreachable("Unhandled instruction!");
9653   } // end of switch.
9654 }
9655 
9656 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9657   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9658   // Construct a vector GEP by widening the operands of the scalar GEP as
9659   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9660   // results in a vector of pointers when at least one operand of the GEP
9661   // is vector-typed. Thus, to keep the representation compact, we only use
9662   // vector-typed operands for loop-varying values.
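  // For illustration only (assuming VF = 4): a scalar
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  // with a loop-varying index is widened into
  //   %gep.vec = getelementptr inbounds i32, i32* %base, <4 x i64> %iv.vec
  // which yields a <4 x i32*> vector of pointers.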
9663 
9664   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9665     // If we are vectorizing, but the GEP has only loop-invariant operands,
9666     // the GEP we build (by only using vector-typed operands for
9667     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9668     // produce a vector of pointers, we need to either arbitrarily pick an
9669     // operand to broadcast, or broadcast a clone of the original GEP.
9670     // Here, we broadcast a clone of the original.
9671     //
9672     // TODO: If at some point we decide to scalarize instructions having
9673     //       loop-invariant operands, this special case will no longer be
9674     //       required. We would add the scalarization decision to
9675     //       collectLoopScalars() and teach getVectorValue() to broadcast
9676     //       the lane-zero scalar value.
9677     auto *Clone = State.Builder.Insert(GEP->clone());
9678     for (unsigned Part = 0; Part < State.UF; ++Part) {
9679       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9680       State.set(this, EntryPart, Part);
9681       State.ILV->addMetadata(EntryPart, GEP);
9682     }
9683   } else {
9684     // If the GEP has at least one loop-varying operand, we are sure to
9685     // produce a vector of pointers. But if we are only unrolling, we want
9686     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9687     // produce with the code below will be scalar (if VF == 1) or vector
9688     // (otherwise). Note that for the unroll-only case, we still maintain
9689     // values in the vector mapping with initVector, as we do for other
9690     // instructions.
9691     for (unsigned Part = 0; Part < State.UF; ++Part) {
9692       // The pointer operand of the new GEP. If it's loop-invariant, we
9693       // won't broadcast it.
9694       auto *Ptr = IsPtrLoopInvariant
9695                       ? State.get(getOperand(0), VPIteration(0, 0))
9696                       : State.get(getOperand(0), Part);
9697 
9698       // Collect all the indices for the new GEP. If any index is
9699       // loop-invariant, we won't broadcast it.
9700       SmallVector<Value *, 4> Indices;
9701       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9702         VPValue *Operand = getOperand(I);
9703         if (IsIndexLoopInvariant[I - 1])
9704           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9705         else
9706           Indices.push_back(State.get(Operand, Part));
9707       }
9708 
9709       // If the GEP instruction is vectorized and was in a basic block that
9710       // needed predication, we can't propagate the poison-generating 'inbounds'
9711       // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which may cause the 'inbounds' property to
      // no longer hold.
9714       bool IsInBounds =
9715           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9716 
9717       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector otherwise.
9719       auto *NewGEP = IsInBounds
9720                          ? State.Builder.CreateInBoundsGEP(
9721                                GEP->getSourceElementType(), Ptr, Indices)
9722                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9723                                                    Ptr, Indices);
9724       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9725              "NewGEP is not a pointer vector");
9726       State.set(this, NewGEP, Part);
9727       State.ILV->addMetadata(NewGEP, GEP);
9728     }
9729   }
9730 }
9731 
9732 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9733   assert(!State.Instance && "Int or FP induction being replicated.");
9734   auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9735   State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV);
9736 }
9737 
9738 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9739   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9740                                  State);
9741 }
9742 
9743 void VPBlendRecipe::execute(VPTransformState &State) {
9744   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9745   // We know that all PHIs in non-header blocks are converted into
9746   // selects, so we don't have to worry about the insertion order and we
9747   // can just use the builder.
9748   // At this point we generate the predication tree. There may be
9749   // duplications since this is a simple recursive scan, but future
9750   // optimizations will clean it up.
9751 
9752   unsigned NumIncoming = getNumIncomingValues();
9753 
9754   // Generate a sequence of selects of the form:
9755   // SELECT(Mask3, In3,
9756   //        SELECT(Mask2, In2,
9757   //               SELECT(Mask1, In1,
9758   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // (and which are therefore essentially undef) take their value from In0.
9761   InnerLoopVectorizer::VectorParts Entry(State.UF);
9762   for (unsigned In = 0; In < NumIncoming; ++In) {
9763     for (unsigned Part = 0; Part < State.UF; ++Part) {
9764       // We might have single edge PHIs (blocks) - use an identity
9765       // 'select' for the first PHI operand.
9766       Value *In0 = State.get(getIncomingValue(In), Part);
9767       if (In == 0)
9768         Entry[Part] = In0; // Initialize with the first incoming value.
9769       else {
9770         // Select between the current value and the previous incoming edge
9771         // based on the incoming mask.
9772         Value *Cond = State.get(getMask(In), Part);
9773         Entry[Part] =
9774             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9775       }
9776     }
9777   }
9778   for (unsigned Part = 0; Part < State.UF; ++Part)
9779     State.set(this, Entry[Part], Part);
9780 }
9781 
9782 void VPInterleaveRecipe::execute(VPTransformState &State) {
9783   assert(!State.Instance && "Interleave group being replicated.");
9784   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9785                                       getStoredValues(), getMask());
9786 }
9787 
9788 void VPReductionRecipe::execute(VPTransformState &State) {
9789   assert(!State.Instance && "Reduction being replicated.");
9790   Value *PrevInChain = State.get(getChainOp(), 0);
9791   RecurKind Kind = RdxDesc->getRecurrenceKind();
9792   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9793   // Propagate the fast-math flags carried by the underlying instruction.
9794   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9795   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9796   for (unsigned Part = 0; Part < State.UF; ++Part) {
9797     Value *NewVecOp = State.get(getVecOp(), Part);
9798     if (VPValue *Cond = getCondOp()) {
9799       Value *NewCond = State.get(Cond, Part);
9800       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9801       Value *Iden = RdxDesc->getRecurrenceIdentity(
9802           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9803       Value *IdenVec =
9804           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9805       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9806       NewVecOp = Select;
9807     }
9808     Value *NewRed;
9809     Value *NextInChain;
9810     if (IsOrdered) {
9811       if (State.VF.isVector())
9812         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9813                                         PrevInChain);
9814       else
9815         NewRed = State.Builder.CreateBinOp(
9816             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9817             NewVecOp);
9818       PrevInChain = NewRed;
9819     } else {
9820       PrevInChain = State.get(getChainOp(), Part);
9821       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9822     }
9823     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9824       NextInChain =
9825           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9826                          NewRed, PrevInChain);
9827     } else if (IsOrdered)
9828       NextInChain = NewRed;
9829     else
9830       NextInChain = State.Builder.CreateBinOp(
9831           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9832           PrevInChain);
9833     State.set(this, NextInChain, Part);
9834   }
9835 }
9836 
9837 void VPReplicateRecipe::execute(VPTransformState &State) {
9838   if (State.Instance) { // Generate a single instance.
9839     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9840     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9841                                     IsPredicated, State);
9842     // Insert scalar instance packing it into a vector.
9843     if (AlsoPack && State.VF.isVector()) {
9844       // If we're constructing lane 0, initialize to start from poison.
9845       if (State.Instance->Lane.isFirstLane()) {
9846         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9847         Value *Poison = PoisonValue::get(
9848             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9849         State.set(this, Poison, State.Instance->Part);
9850       }
9851       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9852     }
9853     return;
9854   }
9855 
9856   // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for each
9858   // of the UF parts.
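  // For example, UF = 2 and a fixed VF = 4 produce eight scalar clones of a
  // non-uniform instruction, one per (part, lane) pair, but only two clones
  // (lane 0 of each part) when the instruction is uniform.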
9859   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9860   assert((!State.VF.isScalable() || IsUniform) &&
9861          "Can't scalarize a scalable vector");
9862   for (unsigned Part = 0; Part < State.UF; ++Part)
9863     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9864       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9865                                       VPIteration(Part, Lane), IsPredicated,
9866                                       State);
9867 }
9868 
9869 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9870   assert(State.Instance && "Branch on Mask works only on single instance.");
9871 
9872   unsigned Part = State.Instance->Part;
9873   unsigned Lane = State.Instance->Lane.getKnownLane();
9874 
9875   Value *ConditionBit = nullptr;
9876   VPValue *BlockInMask = getMask();
9877   if (BlockInMask) {
9878     ConditionBit = State.get(BlockInMask, Part);
9879     if (ConditionBit->getType()->isVectorTy())
9880       ConditionBit = State.Builder.CreateExtractElement(
9881           ConditionBit, State.Builder.getInt32(Lane));
9882   } else // Block in mask is all-one.
9883     ConditionBit = State.Builder.getTrue();
9884 
9885   // Replace the temporary unreachable terminator with a new conditional branch,
9886   // whose two destinations will be set later when they are created.
9887   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9888   assert(isa<UnreachableInst>(CurrentTerminator) &&
9889          "Expected to replace unreachable terminator with conditional branch.");
9890   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9891   CondBr->setSuccessor(0, nullptr);
9892   ReplaceInstWithInst(CurrentTerminator, CondBr);
9893 }
9894 
9895 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9896   assert(State.Instance && "Predicated instruction PHI works per instance.");
9897   Instruction *ScalarPredInst =
9898       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9899   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9900   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9901   assert(PredicatingBB && "Predicated block has no single predecessor.");
9902   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9903          "operand must be VPReplicateRecipe");
9904 
9905   // By current pack/unpack logic we need to generate only a single phi node: if
9906   // a vector value for the predicated instruction exists at this point it means
9907   // the instruction has vector users only, and a phi for the vector value is
9908   // needed. In this case the recipe of the predicated instruction is marked to
9909   // also do that packing, thereby "hoisting" the insert-element sequence.
9910   // Otherwise, a phi node for the scalar value is needed.
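  // For illustration only (assumed names and types): the vector case produces
  //   %vphi = phi <4 x i32> [ %vec.before, %predicating.bb ],
  //                         [ %vec.with.insert, %predicated.bb ]
  // while the scalar case produces
  //   %phi = phi i32 [ poison, %predicating.bb ],
  //                  [ %scalar.inst, %predicated.bb ]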
9911   unsigned Part = State.Instance->Part;
9912   if (State.hasVectorValue(getOperand(0), Part)) {
9913     Value *VectorValue = State.get(getOperand(0), Part);
9914     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9915     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9916     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9917     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9918     if (State.hasVectorValue(this, Part))
9919       State.reset(this, VPhi, Part);
9920     else
9921       State.set(this, VPhi, Part);
9922     // NOTE: Currently we need to update the value of the operand, so the next
9923     // predicated iteration inserts its generated value in the correct vector.
9924     State.reset(getOperand(0), VPhi, Part);
9925   } else {
9926     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9927     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9928     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9929                      PredicatingBB);
9930     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9931     if (State.hasScalarValue(this, *State.Instance))
9932       State.reset(this, Phi, *State.Instance);
9933     else
9934       State.set(this, Phi, *State.Instance);
9935     // NOTE: Currently we need to update the value of the operand, so the next
9936     // predicated iteration inserts its generated value in the correct vector.
9937     State.reset(getOperand(0), Phi, *State.Instance);
9938   }
9939 }
9940 
9941 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9942   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9943 
9944   // Attempt to issue a wide load.
9945   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9946   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9947 
9948   assert((LI || SI) && "Invalid Load/Store instruction");
9949   assert((!SI || StoredValue) && "No stored value provided for widened store");
9950   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9951 
9952   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9953 
9954   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9955   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9956   bool CreateGatherScatter = !Consecutive;
9957 
9958   auto &Builder = State.Builder;
9959   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9960   bool isMaskRequired = getMask();
9961   if (isMaskRequired)
9962     for (unsigned Part = 0; Part < State.UF; ++Part)
9963       BlockInMaskParts[Part] = State.get(getMask(), Part);
9964 
9965   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9966     // Calculate the pointer for the specific unroll-part.
9967     GetElementPtrInst *PartPtr = nullptr;
9968 
9969     bool InBounds = false;
9970     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9971       InBounds = gep->isInBounds();
9972     if (Reverse) {
9973       // If the address is consecutive but reversed, then the
9974       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
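      // For example (fixed-width VF = 4, illustrative): RunTimeVF = 4, so
      // Part = 1 gives NumElt = -4 and LastLane = -3, i.e. the part pointer
      // is &Ptr[-7]; the loaded/stored vector is reversed separately below.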
9977       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9978       // NumElt = -Part * RunTimeVF
9979       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9980       // LastLane = 1 - RunTimeVF
9981       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9982       PartPtr =
9983           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9984       PartPtr->setIsInBounds(InBounds);
9985       PartPtr = cast<GetElementPtrInst>(
9986           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9987       PartPtr->setIsInBounds(InBounds);
9988       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9989         BlockInMaskParts[Part] =
9990             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9991     } else {
9992       Value *Increment =
9993           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9994       PartPtr = cast<GetElementPtrInst>(
9995           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9996       PartPtr->setIsInBounds(InBounds);
9997     }
9998 
9999     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
10000     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
10001   };
10002 
10003   // Handle Stores:
10004   if (SI) {
10005     State.ILV->setDebugLocFromInst(SI);
10006 
10007     for (unsigned Part = 0; Part < State.UF; ++Part) {
10008       Instruction *NewSI = nullptr;
10009       Value *StoredVal = State.get(StoredValue, Part);
10010       if (CreateGatherScatter) {
10011         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10012         Value *VectorGep = State.get(getAddr(), Part);
10013         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
10014                                             MaskPart);
10015       } else {
10016         if (Reverse) {
10017           // If we store to reverse consecutive memory locations, then we need
10018           // to reverse the order of elements in the stored value.
10019           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
10020           // We don't want to update the value in the map as it might be used in
10021           // another expression. So don't call resetVectorValue(StoredVal).
10022         }
10023         auto *VecPtr =
10024             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10025         if (isMaskRequired)
10026           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
10027                                             BlockInMaskParts[Part]);
10028         else
10029           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
10030       }
10031       State.ILV->addMetadata(NewSI, SI);
10032     }
10033     return;
10034   }
10035 
10036   // Handle loads.
10037   assert(LI && "Must have a load instruction");
10038   State.ILV->setDebugLocFromInst(LI);
10039   for (unsigned Part = 0; Part < State.UF; ++Part) {
10040     Value *NewLI;
10041     if (CreateGatherScatter) {
10042       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10043       Value *VectorGep = State.get(getAddr(), Part);
10044       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
10045                                          nullptr, "wide.masked.gather");
10046       State.ILV->addMetadata(NewLI, LI);
10047     } else {
10048       auto *VecPtr =
10049           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10050       if (isMaskRequired)
10051         NewLI = Builder.CreateMaskedLoad(
10052             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
10053             PoisonValue::get(DataTy), "wide.masked.load");
10054       else
10055         NewLI =
10056             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10057 
      // Add metadata to the load, but set the vector value to the reverse
      // shuffle.
10059       State.ILV->addMetadata(NewLI, LI);
10060       if (Reverse)
10061         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10062     }
10063 
10064     State.set(this, NewLI, Part);
10065   }
10066 }
10067 
10068 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10069 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10070 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10071 // for predication.
10072 static ScalarEpilogueLowering getScalarEpilogueLowering(
10073     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10074     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10075     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10076     LoopVectorizationLegality &LVL) {
10077   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10078   // don't look at hints or options, and don't request a scalar epilogue.
10079   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10080   // LoopAccessInfo (due to code dependency and not being able to reliably get
10081   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10082   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10083   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10084   // back to the old way and vectorize with versioning when forced. See D81345.)
10085   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10086                                                       PGSOQueryType::IRPass) &&
10087                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10088     return CM_ScalarEpilogueNotAllowedOptSize;
10089 
10090   // 2) If set, obey the directives
10091   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10092     switch (PreferPredicateOverEpilogue) {
10093     case PreferPredicateTy::ScalarEpilogue:
10094       return CM_ScalarEpilogueAllowed;
10095     case PreferPredicateTy::PredicateElseScalarEpilogue:
10096       return CM_ScalarEpilogueNotNeededUsePredicate;
10097     case PreferPredicateTy::PredicateOrDontVectorize:
10098       return CM_ScalarEpilogueNotAllowedUsePredicate;
10099     };
10100   }
10101 
10102   // 3) If set, obey the hints
10103   switch (Hints.getPredicate()) {
10104   case LoopVectorizeHints::FK_Enabled:
10105     return CM_ScalarEpilogueNotNeededUsePredicate;
10106   case LoopVectorizeHints::FK_Disabled:
10107     return CM_ScalarEpilogueAllowed;
10108   };
10109 
10110   // 4) if the TTI hook indicates this is profitable, request predication.
10111   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10112                                        LVL.getLAI()))
10113     return CM_ScalarEpilogueNotNeededUsePredicate;
10114 
10115   return CM_ScalarEpilogueAllowed;
10116 }
10117 
10118 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def, return the one relevant for
  // \p Part.
10120   if (hasVectorValue(Def, Part))
10121     return Data.PerPartOutput[Def][Part];
10122 
10123   if (!hasScalarValue(Def, {Part, 0})) {
10124     Value *IRV = Def->getLiveInIRValue();
10125     Value *B = ILV->getBroadcastInstrs(IRV);
10126     set(Def, B, Part);
10127     return B;
10128   }
10129 
10130   Value *ScalarValue = get(Def, {Part, 0});
10131   // If we aren't vectorizing, we can just copy the scalar map values over
10132   // to the vector map.
10133   if (VF.isScalar()) {
10134     set(Def, ScalarValue, Part);
10135     return ScalarValue;
10136   }
10137 
10138   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10139   bool IsUniform = RepR && RepR->isUniform();
10140 
10141   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10142   // Check if there is a scalar value for the selected lane.
10143   if (!hasScalarValue(Def, {Part, LastLane})) {
10144     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10145     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
10146            "unexpected recipe found to be invariant");
10147     IsUniform = true;
10148     LastLane = 0;
10149   }
10150 
10151   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10152   // Set the insert point after the last scalarized instruction or after the
10153   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10154   // will directly follow the scalar definitions.
10155   auto OldIP = Builder.saveIP();
10156   auto NewIP =
10157       isa<PHINode>(LastInst)
10158           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10159           : std::next(BasicBlock::iterator(LastInst));
10160   Builder.SetInsertPoint(&*NewIP);
10161 
10162   // However, if we are vectorizing, we need to construct the vector values.
10163   // If the value is known to be uniform after vectorization, we can just
10164   // broadcast the scalar value corresponding to lane zero for each unroll
10165   // iteration. Otherwise, we construct the vector values using
10166   // insertelement instructions. Since the resulting vectors are stored in
10167   // State, we will only generate the insertelements once.
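  // For illustration only (assuming VF = 4 and i32 scalars), the non-uniform
  // case builds:
  //   %p0 = insertelement <4 x i32> poison, i32 %s0, i32 0
  //   %p1 = insertelement <4 x i32> %p0, i32 %s1, i32 1
  //   %p2 = insertelement <4 x i32> %p1, i32 %s2, i32 2
  //   %p3 = insertelement <4 x i32> %p2, i32 %s3, i32 3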
10168   Value *VectorValue = nullptr;
10169   if (IsUniform) {
10170     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10171     set(Def, VectorValue, Part);
10172   } else {
    // Initialize packing with insertelements to start from poison.
10174     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10175     Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10176     set(Def, Undef, Part);
10177     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10178       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10179     VectorValue = get(Def, Part);
10180   }
10181   Builder.restoreIP(OldIP);
10182   return VectorValue;
10183 }
10184 
10185 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
10187 // VPlan-to-VPlan transformations from the very beginning without modifying the
10188 // input LLVM IR.
10189 static bool processLoopInVPlanNativePath(
10190     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10191     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10192     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10193     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10194     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10195     LoopVectorizationRequirements &Requirements) {
10196 
10197   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10198     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10199     return false;
10200   }
10201   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10202   Function *F = L->getHeader()->getParent();
10203   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10204 
10205   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10206       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10207 
10208   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10209                                 &Hints, IAI);
10210   // Use the planner for outer loop vectorization.
10211   // TODO: CM is not used at this point inside the planner. Turn CM into an
10212   // optional argument if we don't need it in the future.
10213   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10214                                Requirements, ORE);
10215 
10216   // Get user vectorization factor.
10217   ElementCount UserVF = Hints.getWidth();
10218 
10219   CM.collectElementTypesForWidening();
10220 
10221   // Plan how to best vectorize, return the best VF and its cost.
10222   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10223 
10224   // If we are stress testing VPlan builds, do not attempt to generate vector
10225   // code. Masked vector code generation support will follow soon.
10226   // Also, do not attempt to vectorize if no vector code will be produced.
10227   if (VPlanBuildStressTest || EnableVPlanPredication ||
10228       VectorizationFactor::Disabled() == VF)
10229     return false;
10230 
10231   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10232 
10233   {
10234     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10235                              F->getParent()->getDataLayout());
10236     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10237                            &CM, BFI, PSI, Checks);
10238     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10239                       << L->getHeader()->getParent()->getName() << "\"\n");
10240     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10241   }
10242 
10243   // Mark the loop as already vectorized to avoid vectorizing again.
10244   Hints.setAlreadyVectorized();
10245   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10246   return true;
10247 }
10248 
10249 // Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with double precision there
// will be a performance penalty from the conversion overhead and the change in
10252 // the vector width.
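// For example (illustrative C source), a loop body such as
//   f[i] = (float)(d * (double)f[i]);
// introduces an fpext/fptrunc pair feeding a float store, which is exactly
// the pattern the traversal below looks for.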
10253 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10254   SmallVector<Instruction *, 4> Worklist;
10255   for (BasicBlock *BB : L->getBlocks()) {
10256     for (Instruction &Inst : *BB) {
10257       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10258         if (S->getValueOperand()->getType()->isFloatTy())
10259           Worklist.push_back(S);
10260       }
10261     }
10262   }
10263 
  // Traverse the floating point stores upwards, searching for floating point
10265   // conversions.
10266   SmallPtrSet<const Instruction *, 4> Visited;
10267   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10268   while (!Worklist.empty()) {
10269     auto *I = Worklist.pop_back_val();
10270     if (!L->contains(I))
10271       continue;
10272     if (!Visited.insert(I).second)
10273       continue;
10274 
10275     // Emit a remark if the floating point store required a floating
10276     // point conversion.
10277     // TODO: More work could be done to identify the root cause such as a
10278     // constant or a function return type and point the user to it.
10279     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10280       ORE->emit([&]() {
10281         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10282                                           I->getDebugLoc(), L->getHeader())
10283                << "floating point conversion changes vector width. "
10284                << "Mixed floating point precision requires an up/down "
10285                << "cast that will negatively impact performance.";
10286       });
10287 
10288     for (Use &Op : I->operands())
10289       if (auto *OpI = dyn_cast<Instruction>(Op))
10290         Worklist.push_back(OpI);
10291   }
10292 }
10293 
10294 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10295     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10296                                !EnableLoopInterleaving),
10297       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10298                               !EnableLoopVectorization) {}
10299 
10300 bool LoopVectorizePass::processLoop(Loop *L) {
10301   assert((EnableVPlanNativePath || L->isInnermost()) &&
10302          "VPlan-native path is not enabled. Only process inner loops.");
10303 
10304 #ifndef NDEBUG
10305   const std::string DebugLocStr = getDebugLocString(L);
10306 #endif /* NDEBUG */
10307 
10308   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10309                     << L->getHeader()->getParent()->getName() << "\" from "
10310                     << DebugLocStr << "\n");
10311 
10312   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10313 
10314   LLVM_DEBUG(
10315       dbgs() << "LV: Loop hints:"
10316              << " force="
10317              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10318                      ? "disabled"
10319                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10320                             ? "enabled"
10321                             : "?"))
10322              << " width=" << Hints.getWidth()
10323              << " interleave=" << Hints.getInterleave() << "\n");
10324 
10325   // Function containing loop
10326   Function *F = L->getHeader()->getParent();
10327 
10328   // Looking at the diagnostic output is the only way to determine if a loop
10329   // was vectorized (other than looking at the IR or machine code), so it
10330   // is important to generate an optimization remark for each loop. Most of
10331   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10332   // generated as OptimizationRemark and OptimizationRemarkMissed are
10333   // less verbose reporting vectorized loops and unvectorized loops that may
10334   // benefit from vectorization, respectively.
10335 
10336   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10337     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10338     return false;
10339   }
10340 
10341   PredicatedScalarEvolution PSE(*SE, *L);
10342 
10343   // Check if it is legal to vectorize the loop.
10344   LoopVectorizationRequirements Requirements;
10345   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10346                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10347   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10348     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10349     Hints.emitRemarkWithHints();
10350     return false;
10351   }
10352 
10353   // Check the function attributes and profiles to find out if this function
10354   // should be optimized for size.
10355   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10356       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10357 
10358   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10359   // here. They may require CFG and instruction level transformations before
10360   // even evaluating whether vectorization is profitable. Since we cannot modify
10361   // the incoming IR, we need to build VPlan upfront in the vectorization
10362   // pipeline.
10363   if (!L->isInnermost())
10364     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10365                                         ORE, BFI, PSI, Hints, Requirements);
10366 
10367   assert(L->isInnermost() && "Inner loop expected.");
10368 
10369   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10370   // count by optimizing for size, to minimize overheads.
10371   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10372   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10373     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10374                       << "This loop is worth vectorizing only if no scalar "
10375                       << "iteration overheads are incurred.");
10376     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10377       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10378     else {
10379       LLVM_DEBUG(dbgs() << "\n");
10380       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10381     }
10382   }
10383 
10384   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem correct -- what if the loop is
10386   // an integer loop and the vector instructions selected are purely integer
10387   // vector instructions?
10388   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10389     reportVectorizationFailure(
10390         "Can't vectorize when the NoImplicitFloat attribute is used",
10391         "loop not vectorized due to NoImplicitFloat attribute",
10392         "NoImplicitFloat", ORE, L);
10393     Hints.emitRemarkWithHints();
10394     return false;
10395   }
10396 
10397   // Check if the target supports potentially unsafe FP vectorization.
10398   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10399   // for the target we're vectorizing for, to make sure none of the
10400   // additional fp-math flags can help.
10401   if (Hints.isPotentiallyUnsafe() &&
10402       TTI->isFPVectorizationPotentiallyUnsafe()) {
10403     reportVectorizationFailure(
10404         "Potentially unsafe FP op prevents vectorization",
10405         "loop not vectorized due to unsafe FP support.",
10406         "UnsafeFP", ORE, L);
10407     Hints.emitRemarkWithHints();
10408     return false;
10409   }
10410 
10411   bool AllowOrderedReductions;
10412   // If the flag is set, use that instead and override the TTI behaviour.
10413   if (ForceOrderedReductions.getNumOccurrences() > 0)
10414     AllowOrderedReductions = ForceOrderedReductions;
10415   else
10416     AllowOrderedReductions = TTI->enableOrderedReductions();
10417   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10418     ORE->emit([&]() {
10419       auto *ExactFPMathInst = Requirements.getExactFPInst();
10420       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10421                                                  ExactFPMathInst->getDebugLoc(),
10422                                                  ExactFPMathInst->getParent())
10423              << "loop not vectorized: cannot prove it is safe to reorder "
10424                 "floating-point operations";
10425     });
10426     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10427                          "reorder floating-point operations\n");
10428     Hints.emitRemarkWithHints();
10429     return false;
10430   }
10431 
10432   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10433   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10434 
10435   // If an override option has been passed in for interleaved accesses, use it.
10436   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10437     UseInterleaved = EnableInterleavedMemAccesses;
10438 
10439   // Analyze interleaved memory accesses.
10440   if (UseInterleaved) {
10441     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10442   }
10443 
10444   // Use the cost model.
10445   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10446                                 F, &Hints, IAI);
10447   CM.collectValuesToIgnore();
10448   CM.collectElementTypesForWidening();
10449 
10450   // Use the planner for vectorization.
10451   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10452                                Requirements, ORE);
10453 
10454   // Get user vectorization factor and interleave count.
10455   ElementCount UserVF = Hints.getWidth();
10456   unsigned UserIC = Hints.getInterleave();
10457 
10458   // Plan how to best vectorize, return the best VF and its cost.
10459   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10460 
10461   VectorizationFactor VF = VectorizationFactor::Disabled();
10462   unsigned IC = 1;
10463 
10464   if (MaybeVF) {
10465     VF = *MaybeVF;
10466     // Select the interleave count.
10467     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10468   }
10469 
10470   // Identify the diagnostic messages that should be produced.
10471   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10472   bool VectorizeLoop = true, InterleaveLoop = true;
10473   if (VF.Width.isScalar()) {
10474     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10475     VecDiagMsg = std::make_pair(
10476         "VectorizationNotBeneficial",
10477         "the cost-model indicates that vectorization is not beneficial");
10478     VectorizeLoop = false;
10479   }
10480 
10481   if (!MaybeVF && UserIC > 1) {
10482     // Tell the user interleaving was avoided up-front, despite being explicitly
10483     // requested.
10484     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10485                          "interleaving should be avoided up front\n");
10486     IntDiagMsg = std::make_pair(
10487         "InterleavingAvoided",
10488         "Ignoring UserIC, because interleaving was avoided up front");
10489     InterleaveLoop = false;
10490   } else if (IC == 1 && UserIC <= 1) {
10491     // Tell the user interleaving is not beneficial.
10492     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10493     IntDiagMsg = std::make_pair(
10494         "InterleavingNotBeneficial",
10495         "the cost-model indicates that interleaving is not beneficial");
10496     InterleaveLoop = false;
10497     if (UserIC == 1) {
10498       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10499       IntDiagMsg.second +=
10500           " and is explicitly disabled or interleave count is set to 1";
10501     }
10502   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
10504     LLVM_DEBUG(
10505         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10506     IntDiagMsg = std::make_pair(
10507         "InterleavingBeneficialButDisabled",
10508         "the cost-model indicates that interleaving is beneficial "
10509         "but is explicitly disabled or interleave count is set to 1");
10510     InterleaveLoop = false;
10511   }
10512 
10513   // Override IC if user provided an interleave count.
10514   IC = UserIC > 0 ? UserIC : IC;
10515 
10516   // Emit diagnostic messages, if any.
10517   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10518   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10520     ORE->emit([&]() {
10521       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10522                                       L->getStartLoc(), L->getHeader())
10523              << VecDiagMsg.second;
10524     });
10525     ORE->emit([&]() {
10526       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10527                                       L->getStartLoc(), L->getHeader())
10528              << IntDiagMsg.second;
10529     });
10530     return false;
10531   } else if (!VectorizeLoop && InterleaveLoop) {
10532     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10533     ORE->emit([&]() {
10534       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10535                                         L->getStartLoc(), L->getHeader())
10536              << VecDiagMsg.second;
10537     });
10538   } else if (VectorizeLoop && !InterleaveLoop) {
10539     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10540                       << ") in " << DebugLocStr << '\n');
10541     ORE->emit([&]() {
10542       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10543                                         L->getStartLoc(), L->getHeader())
10544              << IntDiagMsg.second;
10545     });
10546   } else if (VectorizeLoop && InterleaveLoop) {
10547     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10548                       << ") in " << DebugLocStr << '\n');
10549     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10550   }
10551 
10552   bool DisableRuntimeUnroll = false;
10553   MDNode *OrigLoopID = L->getLoopID();
10554   {
10555     // Optimistically generate runtime checks. Drop them if they turn out to not
10556     // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
10558     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10559                              F->getParent()->getDataLayout());
10560     if (!VF.Width.isScalar() || IC > 1)
10561       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10562 
10563     using namespace ore;
10564     if (!VectorizeLoop) {
10565       assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
10568       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10569                                  &CM, BFI, PSI, Checks);
10570 
10571       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10572       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10573 
10574       ORE->emit([&]() {
10575         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10576                                   L->getHeader())
10577                << "interleaved loop (interleaved count: "
10578                << NV("InterleaveCount", IC) << ")";
10579       });
10580     } else {
10581       // If we decided that it is *legal* to vectorize the loop, then do it.
10582 
10583       // Consider vectorizing the epilogue too if it's profitable.
10584       VectorizationFactor EpilogueVF =
10585           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10586       if (EpilogueVF.Width.isVector()) {
10587 
10588         // The first pass vectorizes the main loop and creates a scalar epilogue
10589         // to be vectorized by executing the plan (potentially with a different
10590         // factor) again shortly afterwards.
10591         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10592         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10593                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10594 
10595         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10596         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10597                         DT);
10598         ++LoopsVectorized;
10599 
10600         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10601         formLCSSARecursively(*L, *DT, LI, SE);
10602 
10603         // Second pass vectorizes the epilogue and adjusts the control flow
10604         // edges from the first pass.
10605         EPI.MainLoopVF = EPI.EpilogueVF;
10606         EPI.MainLoopUF = EPI.EpilogueUF;
10607         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10608                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10609                                                  Checks);
10610 
10611         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10612 
10613         // Ensure that the start values for any VPReductionPHIRecipes are
10614         // updated before vectorising the epilogue loop.
10615         VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
10616         for (VPRecipeBase &R : Header->phis()) {
10617           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10618             if (auto *Resume = MainILV.getReductionResumeValue(
10619                     ReductionPhi->getRecurrenceDescriptor())) {
10620               VPValue *StartVal = new VPValue(Resume);
10621               BestEpiPlan.addExternalDef(StartVal);
10622               ReductionPhi->setOperand(0, StartVal);
10623             }
10624           }
10625         }
10626 
10627         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10628                         DT);
10629         ++LoopsEpilogueVectorized;
10630 
10631         if (!MainILV.areSafetyChecksAdded())
10632           DisableRuntimeUnroll = true;
10633       } else {
10634         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10635                                &LVL, &CM, BFI, PSI, Checks);
10636 
10637         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10638         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10639         ++LoopsVectorized;
10640 
10641         // Add metadata to disable runtime unrolling a scalar loop when there
10642         // are no runtime checks about strides and memory. A scalar loop that is
10643         // rarely used is not worth unrolling.
10644         if (!LB.areSafetyChecksAdded())
10645           DisableRuntimeUnroll = true;
10646       }
10647       // Report the vectorization decision.
10648       ORE->emit([&]() {
10649         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10650                                   L->getHeader())
10651                << "vectorized loop (vectorization width: "
10652                << NV("VectorizationFactor", VF.Width)
10653                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10654       });
10655     }
10656 
10657     if (ORE->allowExtraAnalysis(LV_NAME))
10658       checkMixedPrecision(L, ORE);
10659   }
10660 
10661   Optional<MDNode *> RemainderLoopID =
10662       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10663                                       LLVMLoopVectorizeFollowupEpilogue});
10664   if (RemainderLoopID.hasValue()) {
10665     L->setLoopID(RemainderLoopID.getValue());
10666   } else {
10667     if (DisableRuntimeUnroll)
10668       AddRuntimeUnrollDisableMetaData(L);
10669 
10670     // Mark the loop as already vectorized to avoid vectorizing again.
10671     Hints.setAlreadyVectorized();
10672   }
10673 
10674   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10675   return true;
10676 }
10677 
10678 LoopVectorizeResult LoopVectorizePass::runImpl(
10679     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10680     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10681     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10682     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10683     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10684   SE = &SE_;
10685   LI = &LI_;
10686   TTI = &TTI_;
10687   DT = &DT_;
10688   BFI = &BFI_;
10689   TLI = TLI_;
10690   AA = &AA_;
10691   AC = &AC_;
10692   GetLAA = &GetLAA_;
10693   DB = &DB_;
10694   ORE = &ORE_;
10695   PSI = PSI_;
10696 
10697   // Don't attempt if
10698   // 1. the target claims to have no vector registers, and
10699   // 2. interleaving won't help ILP.
10700   //
10701   // The second condition is necessary because, even if the target has no
10702   // vector registers, loop vectorization may still enable scalar
10703   // interleaving.
10704   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10705       TTI->getMaxInterleaveFactor(1) < 2)
10706     return LoopVectorizeResult(false, false);
10707 
10708   bool Changed = false, CFGChanged = false;
10709 
10710   // The vectorizer requires loops to be in simplified form.
10711   // Since simplification may add new inner loops, it has to run before the
10712   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10714   // vectorized.
10715   for (auto &L : *LI)
10716     Changed |= CFGChanged |=
10717         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10718 
10719   // Build up a worklist of inner-loops to vectorize. This is necessary as
10720   // the act of vectorizing or partially unrolling a loop creates new loops
10721   // and can invalidate iterators across the loops.
10722   SmallVector<Loop *, 8> Worklist;
10723 
10724   for (Loop *L : *LI)
10725     collectSupportedLoops(*L, LI, ORE, Worklist);
10726 
10727   LoopsAnalyzed += Worklist.size();
10728 
10729   // Now walk the identified inner loops.
10730   while (!Worklist.empty()) {
10731     Loop *L = Worklist.pop_back_val();
10732 
10733     // For the inner loops we actually process, form LCSSA to simplify the
10734     // transform.
10735     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10736 
10737     Changed |= CFGChanged |= processLoop(L);
10738   }
10739 
10740   // Process each loop nest in the function.
10741   return LoopVectorizeResult(Changed, CFGChanged);
10742 }
10743 
10744 PreservedAnalyses LoopVectorizePass::run(Function &F,
10745                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
10793 }
10794 
10795 void LoopVectorizePass::printPipeline(
10796     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10797   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10798       OS, MapClassName2PassName);
10799 
10800   OS << "<";
10801   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10802   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10803   OS << ">";
10804 }
10805